presse/epub.php

702 lines
19 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0">
<title>Revue de presse - Epub</title>
<link rel="icon" type="image/png" href="1F4F0_color.png" /></head>
<body>
<?php
require 'settings.php';
require 'TPEpubCreator.php';
/**
 * Serialises the children of a DOM node into a simplified HTML fragment.
 *
 * <aside> and <blockquote> elements are rewritten as bold-italic runs, then
 * every tag except <p>, <b>, <h2>, <i> and <strong> is stripped.
 *
 * @param DOMNode|null $node Node whose children are serialised; null yields an empty fragment.
 * @return string The cleaned markup wrapped in a single <div>.
 */
function get_clean_html($node)
{
    // todo check words with "-" (like "sous-traitance")
    $innerHTML = '';
    if ($node !== null)
    {
        foreach ($node->childNodes as $childNode)
        {
            $nodeHTML = $childNode->ownerDocument->saveHTML($childNode);
            // Only the tag openers are replaced so that attributes and the
            // closing ">" of the original tags carry over to the <i> element.
            $innerHTML .= str_replace(
                ['<aside', '</aside', '<blockquote', '</blockquote'],
                ['<b><i', '</i></b', '<b><i', '</i></b'],
                $nodeHTML
            );
        }
    }
    return '<div>' . strip_tags($innerHTML, '<p><b><h2><i><strong>') . '</div>';
}
/**
 * Downloads one Mediapart article and appends it to the epub as a page.
 *
 * When the "mptoday" form option is set, URLs that do not contain today's
 * date (formatted "dmy") are skipped. Articles whose body cannot be located
 * only produce a warning and are not added.
 *
 * @param mixed $url Article URL (string or string-castable value such as a SimpleXMLElement).
 * @param resource|null $context Stream context used for the download.
 * @param object $epub Epub builder; only its AddPage() method is called here.
 * @return void
 */
function add_mp_article($url, $context, $epub)
{
    $url = (string) $url;
    if (!empty($_POST['mptoday']) && !str_contains($url, (new DateTime('today'))->format('dmy')))
    {
        return;
    }
    $article = file_get_contents($url, false, $context);
    if ($article === false || $article === '')
    {
        // DOMDocument::loadHTML() rejects an empty string, so stop here.
        echo 'warning: could not download "' . htmlspecialchars($url) . '"<br>';
        return;
    }
    $doc = new DOMDocument();
    // Keep libxml's complaints about the page markup out of the generated page.
    $previous = libxml_use_internal_errors(true);
    $doc->loadHTML($article);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    $finder = new DomXPath($doc);
    // Text of the n-th match, or '' when the page does not contain it.
    $text = static fn ($query, $index = 0) => $finder->query($query)->item($index)?->textContent ?? '';
    $title = $text('//h1');
    $author = $text('//a[contains(@href, "biographie")]');
    $date = trim($text('//time', 1));
    $summary = $text('//p[contains(@class, "news__heading__top__intro")]');
    // strip images and screen-reader-only labels
    $toremove = $finder->query('//svg | //figure | //span[@class="screen-reader-only"]');
    foreach ($toremove as $elt)
    {
        $elt->parentNode?->removeChild($elt);
    }
    // textContent is decoded text, so it has to be re-escaped before being used as markup.
    $result = '<h1>' . htmlspecialchars($title) . '</h1>';
    $result .= '<p>' . htmlspecialchars($author) . '</p>';
    $result .= '<p>' . htmlspecialchars($date) . '</p>';
    $result .= '<p><b>' . htmlspecialchars($summary) . '</b></p>';
    $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
    if (!$nodes->length)
    {
        // free-access articles
        $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]');
    }
    if (!$nodes->length)
    {
        echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
        return;
    }
    $result .= get_clean_html($nodes->item(0));
    // add the "boite noire" box when the article has one
    $nodes = $finder->query('//aside[contains(@class, "_black")]');
    if ($nodes->length)
    {
        $result .= '<div>' . get_clean_html($nodes->item(0)) . '</div>';
    }
    $epub->AddPage($result, false, $title);
}
/**
 * Logs in to Mediapart and returns the MPSESSID cookie value.
 *
 * Credentials are read from the globals $mp_user and $mp_password.
 *
 * @return string The session id, or '' when the request failed or no
 *                MPSESSID cookie was set.
 */
function get_mp_cookie()
{
    global $mp_user;
    global $mp_password;
    $url = 'https://www.mediapart.fr/login_check';
    // http_build_query() URL-encodes the credentials, so characters such as
    // "&", "+" or "=" in a password no longer corrupt the form body.
    $postfields = http_build_query([
        'email' => (string) $mp_user,
        'password' => (string) $mp_password,
        'submitButton' => '',
        '_target_path' => 'https://www.mediapart.fr/',
    ], '', '&');
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, false);
    // Headers are included in the output so the Set-Cookie lines can be parsed.
    curl_setopt($ch, CURLOPT_HEADER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $postfields);
    $result = curl_exec($ch);
    if (!is_string($result))
    {
        echo 'warning: Mediapart login request failed: ' . htmlspecialchars(curl_error($ch)) . '<br>';
        return '';
    }
    $sessid = '';
    $prefix = 'MPSESSID=';
    preg_match_all('/^Set-Cookie:\s*([^;]*)/mi', $result, $matches);
    // Redirects are followed, so several responses may set the cookie; the last one wins.
    foreach ($matches[1] as $item)
    {
        if (str_starts_with($item, $prefix))
        {
            $sessid = substr($item, strlen($prefix));
        }
    }
    return $sessid;
}
/**
 * Writes the epub to disk and reports the outcome on the page.
 *
 * Any existing file at the target path is removed first. The archive is only
 * generated when the builder carries no error; the builder's error is echoed
 * if it was set before or during generation, otherwise a download link is.
 *
 * @param object $epub Epub builder exposing epub_file, error and CreateEPUB().
 * @return void
 */
function write_epub($epub)
{
    $target = $epub->epub_file;
    if (file_exists($target))
    {
        unlink($target);
    }

    if (!$epub->error)
    {
        $epub->CreateEPUB();
    }

    // Covers both a pre-existing error and one raised by CreateEPUB().
    if ($epub->error)
    {
        echo $epub->error;
        return;
    }

    echo 'Success: <a href="' . $epub->epub_file . '">' . $epub->epub_file . '</a> created.<br>';
}
/**
 * Fetches a Le Monde page image into the temp folder.
 *
 * The image URL is derived from $base_url by replacing its
 * "GetPublicationContentItems-….json" part with "<url_part><id>.jpg".
 *
 * @param string $base_url URL of the publication content JSON.
 * @param string $url_part Image file name prefix (e.g. "Preview-MEDIUM-").
 * @param mixed $id Page identifier, used in both the remote and the local file name.
 * @return string Relative path of the saved image ("./temp/<id>.jpg").
 */
function lm_download_image($base_url, $url_part, $id)
{
    $file_name = $id . '.jpg';
    $source = preg_replace('/GetPublicationContentItems-.*\.json/', $url_part . $file_name, $base_url);
    $destination = './temp/' . $file_name;
    file_put_contents($destination, file_get_contents($source));
    return $destination;
}
/**
 * Loads and decodes a JSON document, caching it under temp/.
 *
 * The raw JSON is cached in case the URL expires. Only documents that were
 * downloaded and decoded successfully are cached, and an unreadable or
 * invalid cache entry is treated as a miss, so one failed request does not
 * poison later calls.
 *
 * @param string $url Location of the JSON document.
 * @return mixed The decoded value (objects as stdClass), or null on failure.
 */
function get_json($url)
{
    $tempjsonpath = 'temp/' . hash('md5', $url) . '.json';
    if (is_file($tempjsonpath))
    {
        $cached = file_get_contents($tempjsonpath);
        if ($cached !== false)
        {
            $decoded = json_decode($cached);
            if (json_last_error() === JSON_ERROR_NONE)
            {
                return $decoded;
            }
        }
    }
    $json = file_get_contents($url);
    if ($json === false)
    {
        return null;
    }
    $decoded = json_decode($json);
    if (json_last_error() !== JSON_ERROR_NONE)
    {
        return null;
    }
    file_put_contents($tempjsonpath, $json);
    return $decoded;
}
// All dates produced by this script are expressed in Paris time.
date_default_timezone_set('Europe/Paris');
// Today's date as YYYYMMDD, used in the epub titles below.
$today = date_create('today')->format('Ymd');
// Optionally wipe the cached downloads (plain files directly under temp/).
if (!empty($_POST['emptycache']))
{
    $files = glob('temp/*');
    foreach ($files as $file)
    {
        if (!is_file($file))
        {
            continue;
        }
        unlink($file);
    }
}
// Cafeyn: builds an epub from a publication's "material" JSON URL.
if (!empty($_POST['cafeyn']))
{
    $url = $_POST['cafeyn'];
    // extract url from curl command (first single-quoted argument)
    if (str_starts_with($url, 'curl '))
    {
        $url = explode("'", $url)[1] ?? '';
        // The value comes straight from the form, so escape it before echoing.
        echo '<p>extracted url from curl command:</p><p>' . htmlspecialchars($url) . '</p>';
    }
    // cache json in case url expires
    $publication = get_json($url);
    if (!isset($publication->metadata->title))
    {
        echo 'warning: could not load the Cafeyn publication<br>';
    }
    else
    {
        $pubname = $publication->metadata->title;
        $epub = new TPEpubCreator();
        $epub->temp_folder = 'temp/';
        // The title comes from remote JSON: keep directory separators out of the file name.
        $epub->epub_file = 'epub/' . str_replace(['/', '\\'], '_', $pubname) . 'cafeyn.epub';
        $epub->title = $pubname . ' Cafeyn - ' . $today;
        $list = (array) ($publication->articles ?? []);
        // Next page whose thumbnail still has to be inserted.
        $page = 1;
        foreach ($list as $entry)
        {
            $articleurl = str_replace('material', 'article/' . $entry->hash . '.json', $url);
            $article = get_json($articleurl);
            if (!is_object($article))
            {
                echo 'warning: could not load article "' . htmlspecialchars((string) $entry->hash) . '"<br>';
                continue;
            }
            $content = '<h1>' . $article->title . '</h1>';
            $content .= '<div>Page ' . $article->page . '</div>';
            // count() throws on a missing property in PHP 8, hence !empty().
            if (!empty($article->rubrics))
            {
                $content .= '<div>Rubriques: ' . implode(',', $article->rubrics) . '</div>';
            }
            if (!empty($article->authors))
            {
                $content .= '<div>Auteurs: ' . implode(',', $article->authors) . '</div>';
            }
            foreach ($article->content->sections ?? [] as $section)
            {
                // A drop cap is held back and prepended, in bold, to the next plain paragraph.
                $dropcap = '';
                foreach ($section->items ?? [] as $item)
                {
                    if (($item->type ?? null) != 'text')
                    {
                        continue;
                    }
                    $class = $item->class ?? '';
                    if ($class == 'quote')
                    {
                        $content .= '<blockquote>' . $item->content . '</blockquote>';
                    }
                    else if ($class == 'paragraphTitle')
                    {
                        $content .= '<h2>' . $item->content . '</h2>';
                    }
                    else if ($class == 'introduction')
                    {
                        $content .= '<p><b>' . $item->content . '</b></p>';
                    }
                    else if ($class == 'dropcap')
                    {
                        $dropcap = $item->content;
                    }
                    else
                    {
                        $paragraph = $item->content;
                        if ($dropcap)
                        {
                            $paragraph = '<b>' . $dropcap . '</b>' . $paragraph;
                            $dropcap = '';
                        }
                        $content .= '<p>' . $paragraph . '</p>';
                    }
                }
            }
            // Insert the thumbnails of every page up to and including this article's page.
            while ($article->page >= $page)
            {
                if (isset($publication->pages[$page - 1]->tn))
                {
                    $path = $publication->pages[$page - 1]->tn;
                    $path = str_replace('material', $path, $url);
                    $path = str_replace('api.', 'content.', $path);
                    if ($page == 1)
                    {
                        // cover
                        $epub->AddImage($path, 'image/jpeg', true);
                    }
                    else
                    {
                        $epub->AddPage('<img style="width: 100%" src="' . $path . '">', false, 'Page ' . $page, true);
                    }
                }
                $page++;
            }
            $epub->AddPage($content, false, $article->title);
        }
        write_epub($epub);
    }
}
// Le Monde: builds an epub from a "GetPublicationContentItems" JSON URL.
if (!empty($_POST['lemonde']))
{
    $url = $_POST['lemonde'];
    // extract url from curl command (first single-quoted argument)
    if (str_starts_with($url, 'curl '))
    {
        $url = explode("'", $url)[1] ?? '';
        // The value comes straight from the form, so escape it before echoing.
        echo '<p>extracted url from curl command:</p><p>' . htmlspecialchars($url) . '</p>';
    }
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/lemonde.epub';
    $epub->title = 'Le Monde ' . $today;
    $publication = get_json($url);
    // Keep only the items of the main paper, ordered by page.
    $content = array_filter($publication->Content ?? [], fn ($item) => $item->Category == 'Le Monde');
    usort($content, fn ($a, $b) => $a->PageNumber <=> $b->PageNumber);
    $pageindex = get_json(str_replace('GetPublicationContentItems', 'GetPublicationPages', $url));
    // Highest page number whose preview image has already been inserted.
    $page = 0;
    foreach ($content as $article)
    {
        if ($article->PageNumber > $page)
        {
            $page = $article->PageNumber;
            // The arrow function captures $page by value; no global lookup is needed.
            $pagematches = array_values(array_filter($pageindex->Page ?? [], fn ($p) => $page == $p->PageNumber));
            $pageobj = $pagematches[0] ?? null;
            if ($pageobj !== null)
            {
                $pageid = $pageobj->PublicationPageID;
                $path = lm_download_image($url, 'Preview-MEDIUM-', $pageid);
                if ($page == 1)
                {
                    // cover
                    $epub->AddImage($path, 'image/jpeg', true);
                }
                else
                {
                    $epub->AddPage('<img style="width: 100%" src="' . $path . '">', false, 'Page ' . $page, true);
                }
            }
        }
        $items = $article->ContentItem ?? [];
        $articlebody = array_values(array_filter($items, fn ($item) => $item->ContentType == 'text/xml'))[0] ?? null;
        // Items without both a title and a body are not turned into pages.
        if ($articlebody === null || empty($articlebody->Title) || empty($articlebody->HtmlText))
        {
            continue;
        }
        $pagecontent = '<h1>' . strip_tags($articlebody->Title) . '</h1>';
        $pagecontent .= '<p>Page ' . $article->PageNumber . '</p>';
        $author = array_values(array_filter($items, fn ($item) => $item->ContentType == 'author/xml'))[0] ?? null;
        if (!empty($author->Author))
        {
            $pagecontent .= $author->Author;
        }
        if (!empty($articlebody->Introduction))
        {
            $pagecontent .= '<b>' . $articlebody->Introduction . '</b>';
        }
        $pagecontent .= $articlebody->HtmlText;
        $epub->AddPage($pagecontent, false, str_replace('&nbsp;', ' ', strip_tags($articlebody->Title)), true);
    }
    write_epub($epub);
}
// Force a fresh Mediapart login and store the new session id in settings.php.
if (!empty($_POST['forcempcookies']))
{
    $old = (string) $mp_sessionid;
    $new = get_mp_cookie();
    if ($new === '')
    {
        // A failed login must not overwrite the stored session id with nothing.
        echo '<div>Could not get a new mpsessid; settings left unchanged.</div>';
    }
    else
    {
        $mp_sessionid = $new;
        $settings = file_get_contents('settings.php');
        if ($settings === false || $old === '')
        {
            // Without the file content or a previous value there is nothing to substitute.
            echo '<div>New mpsessid is used for this request but could not be saved in settings.</div>';
        }
        else
        {
            file_put_contents('settings.php', str_replace($old, $mp_sessionid, $settings));
            echo '<div>Updated mpsessid in settings.</div>';
        }
    }
}
// Mediapart RSS: one epub page per feed item, oldest first.
if (!empty($_POST['mediapart']))
{
    $feedurl = 'https://www.mediapart.fr/articles/feed';
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid,
        ]
    ];
    // The session cookie is only sent when fetching the articles, not the feed.
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/mediapart.epub';
    $epub->title = 'Mediapart - RSS ' . $today;
    // load feeds
    $feed = file_get_contents($feedurl);
    $items = null;
    if ($feed !== false)
    {
        try
        {
            $xml = new SimpleXMLElement($feed);
            $items = $xml->xpath("/rss/channel/item") ?: [];
        }
        catch (Exception $e)
        {
            // Malformed XML: reported below together with download failures.
            $items = null;
        }
    }
    if ($items === null)
    {
        echo 'warning: could not load the Mediapart feed<br>';
    }
    else
    {
        foreach (array_reverse($items) as $item)
        {
            add_mp_article((string) $item->link, $context, $epub);
        }
        write_epub($epub);
    }
}
// Politis RSS
// Builds an epub for one Politis issue: the cover comes from the shop page,
// the articles from the RSS feed, keeping only those whose page mentions the
// requested issue number.
if (isset($_POST['politis']) && $_POST['politis'])
{
// Issue number as typed in the form; used in the file name, title, cover lookup and article filter.
$issuenb = $_POST['politis'];
$feedurl = 'https://www.politis.fr/flux-rss-politis-fr/';
$opts = [
'http' => [
'method' => "GET",
//'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, // todo?
]
];
// Context used for the article downloads below (no cookie is sent).
$context = stream_context_create($opts);
$epub = new TPEpubCreator();
$epub->temp_folder = 'temp/';
$epub->epub_file = 'epub/politis' . $issuenb . '.epub';
$epub->title = 'Politis ' . $issuenb . ' - RSS ' . $today;
// find cover
// The cover is the first <img> on the shop page whose src contains "<issue>_".
$issues = file_get_contents('https://www.politis.fr/boutique/magazines-unite/');
$doc = new DOMDocument();
$doc->loadHTML($issues);
$finder = new DomXPath($doc);
// NOTE(review): item(0) is null when no image matches, which makes getAttribute() fail.
$cover = $finder->query('//img[contains(@src, "' . $issuenb . '_")]')->item(0)->getAttribute('src');
$epub->AddImage($cover, 'image/jpeg', true);
// load feeds
$feed = file_get_contents($feedurl);
$xml = new SimpleXMLElement($feed);
$items = $xml->xpath("/rss/channel/item");
// Feed items are processed in reverse feed order.
foreach (array_reverse($items) as $item)
{
$url = $item->link;
$article = file_get_contents($url, false, $context);
// Skip articles whose page does not contain the issue marker text.
// NOTE(review): the file is flagged for ambiguous Unicode; check the exact characters of this marker.
if (!str_contains($article, ' dans lhebdo N° ' . $issuenb))
{
continue;
}
$doc = new DOMDocument();
$doc->loadHTML($article);
$finder = new DomXPath($doc);
// Header fields are taken from the first matching node of each query.
$title = $finder->query('//h1')->item(0)->textContent;
$author = $finder->query('//span[@class="nom"]')->item(0)->textContent;
$date = $finder->query('//span[@class="date"]')->item(0)->textContent;
$summary = $finder->query('//div[@class="extrait"]')->item(0)->textContent;
$result = '<h1>' . $title . '</h1>';
$result .= '<p>' . $author . '</p>';
$result .= '<p>' . $date . '</p>';
$result .= '<p><b>' . $summary . '</b></p>';
// Article body: the element with id "content", reduced to basic tags by get_clean_html().
$nodes = $finder->query('//div[@id="content"]');
$node = $nodes->item(0);
$nodehtml = get_clean_html($node);
$result .= $nodehtml;
$epub->AddPage($result, false, $title);
}
write_epub($epub);
}
// Mediapart home page: one epub page per article teaser found on the front page.
if (!empty($_POST['mphome']))
{
    $homeurl = 'https://www.mediapart.fr';
    // Requests are sent with the Mediapart session cookie.
    $context = stream_context_create([
        'http' => [
            'method' => 'GET',
            'header' => "Accept-language: en\nCookie: MPSESSID={$mp_sessionid}",
        ],
    ]);

    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/mediaparthome.epub';
    $epub->title = 'Mediapart - A la une ' . $today;

    $doc = new DOMDocument();
    $doc->loadHTML(file_get_contents($homeurl, false, $context));
    $finder = new DomXPath($doc);

    // Teaser links under /journal/, leaving out dossiers, studio pages,
    // "fil-" links and the series index.
    $conditions = [
        '@data-js = "teaser-link"',
        'contains(@href, "/journal/")',
        'not(contains(@href, "/dossier/"))',
        'not(contains(@href, "/studio/"))',
        'not(contains(@href, "fil-"))',
        '(@href != "/journal/series")',
    ];
    $hrefs = $finder->query('//a[' . implode(' and ', $conditions) . ']/@href');

    foreach ($hrefs as $href)
    {
        // The matched hrefs are site-relative, so the host is prepended.
        add_mp_article('https://www.mediapart.fr' . $href->value, $context, $epub);
    }
    write_epub($epub);
}
// New York Times RSS: one epub page per home-page feed item, oldest first.
if (!empty($_POST['nyt']))
{
    $feedurl = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml';
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid,
        ]
    ];
    // The session cookie is only sent when fetching the articles, not the feed.
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/newyorktimes.epub';
    $epub->title = 'The New York Times ' . $today;
    // load feeds
    $feed = file_get_contents($feedurl);
    $items = null;
    if ($feed !== false)
    {
        try
        {
            $xml = new SimpleXMLElement($feed);
            $items = $xml->xpath("/rss/channel/item") ?: [];
        }
        catch (Exception $e)
        {
            // Malformed XML: reported below together with download failures.
            $items = null;
        }
    }
    if ($items === null)
    {
        echo 'warning: could not load the New York Times feed<br>';
    }
    else
    {
        foreach (array_reverse($items) as $item)
        {
            $title = $item->title;
            // Not every item carries a dc:creator element.
            $author = $item->xpath('dc:creator')[0] ?? '';
            $summary = $item->description;
            $article = file_get_contents((string) $item->link, false, $context);
            if ($article === false || $article === '')
            {
                // DOMDocument::loadHTML() rejects an empty string; skip this item only.
                echo 'warning: could not download "' . htmlspecialchars((string) $title) . '"<br>';
                continue;
            }
            $doc = new DOMDocument();
            $doc->loadHTML($article);
            $finder = new DomXPath($doc);
            $result = '<h1>' . $title . '</h1>';
            $result .= '<p>' . $author . '</p>';
            $result .= '<p>' . $item->pubDate . '</p>';
            $result .= '<p><b>' . $summary . '</b></p>';
            $nodes = $finder->query('//section[@name="articleBody"]');
            if (!$nodes->length)
            {
                echo 'warning: could not get content of "' . htmlspecialchars((string) $title) . '"<br>';
                continue;
            }
            $result .= get_clean_html($nodes->item(0));
            $epub->AddPage($result, false, $title);
        }
        write_epub($epub);
    }
}
// New York Times today's paper: articles are listed in the JSON state
// embedded in the "todayspaper" page (window.__preloadedData).
if (!empty($_POST['nyttoday']))
{
    $todaypage = file_get_contents("https://www.nytimes.com/section/todayspaper");
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid,
        ]
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/newyorktimestoday.epub';
    $epub->title = 'The New York Times today\'s paper ' . $today;
    // grab today's paper
    $marker = "window.__preloadedData = ";
    $data = null;
    $markerpos = is_string($todaypage) ? strpos($todaypage, $marker) : false;
    if ($markerpos !== false)
    {
        $start = $markerpos + strlen($marker);
        // The object literal ends at the first "};" after the marker; keep its closing brace.
        $endpos = strpos($todaypage, "};", $start);
        if ($endpos !== false)
        {
            // "undefined" is not valid JSON, so it is mapped to null before decoding.
            $json = str_replace(":undefined", ":null", substr($todaypage, $start, $endpos + 1 - $start));
            $data = json_decode($json);
        }
    }
    if (!isset($data->initialState))
    {
        echo 'warning: could not read today\'s paper data<br>';
    }
    else
    {
        // initialState maps ids to records; related records are looked up by id.
        $state = $data->initialState;
        foreach ($state as $item)
        {
            if (($item->__typename ?? null) != "Article")
            {
                continue;
            }
            $headlineid = $item->headline->id ?? null;
            $title = $headlineid !== null ? ($state->{$headlineid}->default ?? '') : '';
            $summary = $item->summary ?? '';
            $result = '<h1>' . $title . '</h1>';
            // authors
            $result .= '<p>';
            $bylineid = $item->bylines[0]->id ?? null;
            $creators = $bylineid !== null ? ($state->{$bylineid}->creators ?? []) : [];
            foreach ($creators as $creator)
            {
                $creatorid = $creator->id ?? null;
                $name = $creatorid !== null ? ($state->{$creatorid}->displayName ?? '') : '';
                $result .= $name . '&nbsp;';
            }
            $result .= '</p>';
            $result .= '<p>' . ($item->lastMajorModification ?? '') . '</p>';
            $result .= '<p><b>' . $summary . '</b></p>';
            $article = isset($item->url) ? file_get_contents($item->url, false, $context) : false;
            if ($article === false || $article === '')
            {
                // DOMDocument::loadHTML() rejects an empty string; skip this article only.
                echo 'warning: could not download "' . htmlspecialchars((string) $title) . '"<br>';
                continue;
            }
            $doc = new DOMDocument();
            $doc->loadHTML($article);
            $finder = new DomXPath($doc);
            $nodes = $finder->query('//section[@name="articleBody"]');
            if (!$nodes->length)
            {
                echo 'warning: could not get content of "' . htmlspecialchars((string) $title) . '"<br>';
                continue;
            }
            $result .= get_clean_html($nodes->item(0));
            $epub->AddPage($result, false, $title);
        }
        write_epub($epub);
    }
}
?>
<h1>Generate epub</h1>
<div>
<form method="post">
<input type="submit">
<h2>Mediapart</h2>
<input id="mediapart" name="mediapart" type="checkbox">
<label for="mediapart">From RSS</label>
<br>
<input id="mphome" name="mphome" type="checkbox">
<label for="mphome">From home page</label>
<br>
<input id="forcempcookies" name="forcempcookies" type="checkbox">
<label for="forcempcookies">Force cookies</label>
<br>
<input id="mptoday" name="mptoday" type="checkbox">
<label for="mptoday">Today only</label>
<h2>Le Monde</h2>
<label for="lemonde">"GetPublicationContentItems" json url: </label>
<input id="lemonde" name="lemonde">
<h2>Cafeyn</h2>
<label for="cafeyn">"material" json url: </label>
<input id="cafeyn" name="cafeyn">
<h2>The New York Times</h2>
<input id="nyt" name="nyt" type="checkbox">
<label for="nyt">From RSS</label>
<br>
<input id="nyttoday" name="nyttoday" type="checkbox">
<label for="nyttoday">From today's paper embedded script</label>
<h2>Politis</h2>
<label for="politis">Issue number: </label><input id="politis" name="politis">
<h2>Admin</h2>
<input id="emptycache" name="emptycache" type="checkbox" checked>
<label for="emptycache">Empty cache</label>
<br><br>
<input type="submit">
</form>
</div>
<h1>Links</h1>
<a href="../epub">Fichiers générés</a><br>
<a href="../">Revue de presse</a><br>
<h1>Mediapart RSS summary</h1>
<?php
// List the current Mediapart feed items (publication date and title) as links.
// $context only exists when one of the generators above ran during this
// request; fall back to the default stream context otherwise.
$mprss = file_get_contents('https://www.mediapart.fr/articles/feed', false, $context ?? null);
$items = null;
if ($mprss !== false)
{
    try
    {
        $xml = new SimpleXMLElement($mprss);
        $items = $xml->xpath("/rss/channel/item") ?: [];
    }
    catch (Exception $e)
    {
        // Malformed XML: reported below together with download failures.
        $items = null;
    }
}
if ($items === null)
{
    echo '<div>warning: could not load the Mediapart feed</div>';
}
else
{
    foreach ($items as $item)
    {
        // Feed values are decoded text, so escape them for the attribute and the link label.
        echo '<div><a href="' . htmlspecialchars((string) $item->link) . '">' . htmlspecialchars($item->pubDate . ' ' . $item->title) . '</a></div>';
    }
}
?>
</body>
</html>