<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Fichiers epub</title>
<link rel="icon" type="image/png" href="1F4F0_color.png" />
</head>
<body>
<?php
// Generates EPUB editions of Le Monde, Mediapart and The New York Times on
// form submission, then lists the files found in the epub/ folder.
//
// The variables $lm_includecover, $lm_includeimages, $mp_sessionid and
// $nyt_sessionid are read below but not defined in this file; they are
// presumably set by settings.php.

require 'settings.php';
require 'TPEpubCreator.php';

/**
 * Builds the EPUB file and prints either a success line or the creator's error.
 *
 * @param object $epub A configured TPEpubCreator instance.
 */
function write_epub($epub)
{
    // Only attempt the build when the creator has not already reported an error.
    if (!$epub->error) {
        $epub->CreateEPUB();
    }

    if ($epub->error) {
        echo $epub->error;
        return;
    }

    echo 'Success: ' . escape_html((string) $epub->epub_file) . ' created.<br>';
}

/**
 * Escapes text for safe inclusion in HTML content or attribute values.
 */
function escape_html(string $text): string
{
    return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Tells whether $url is an absolute http:// or https:// URL.
 *
 * Used to keep user-supplied input from reaching local files or other
 * stream wrappers through file_get_contents().
 */
function is_http_url(string $url): bool
{
    $scheme = parse_url($url, PHP_URL_SCHEME);

    return is_string($scheme) && in_array(strtolower($scheme), ['http', 'https'], true);
}

/**
 * Downloads $url and returns its body, or null when the download fails.
 *
 * @param resource|null $context Optional stream context (headers, cookies).
 */
function fetch_url(string $url, $context = null): ?string
{
    $body = file_get_contents($url, false, $context);

    return $body === false ? null : $body;
}

/**
 * Parses an HTML string into a DOMDocument, or returns null when it is empty
 * or cannot be parsed.
 */
function load_html_document(string $html): ?DOMDocument
{
    // DOMDocument::loadHTML() throws a ValueError on an empty string.
    if (trim($html) === '') {
        return null;
    }

    // Collect libxml parse errors internally instead of printing one warning
    // per unknown HTML5 tag into the page.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $loaded ? $doc : null;
}

/**
 * Returns the entries of $items whose ContentType is one of $types, reindexed
 * from zero.
 *
 * @param array    $items Decoded "ContentItem" entries of an article.
 * @param string[] $types Accepted ContentType values.
 */
function content_items_of_type(array $items, array $types): array
{
    return array_values(array_filter(
        $items,
        static fn ($item) => in_array($item->ContentType ?? null, $types, true)
    ));
}

/**
 * Loads the decoded Le Monde publication JSON for $url.
 *
 * The raw JSON is cached under temp/ (keyed by the md5 of the URL) in case
 * the URL expires. Only responses that decode to an object are cached, and a
 * cache entry that does not decode is ignored and fetched again.
 */
function load_lemonde_publication(string $url): ?object
{
    $cachepath = 'temp/' . hash('md5', $url) . '.json';

    if (is_file($cachepath)) {
        $cached = file_get_contents($cachepath);
        $publication = $cached === false ? null : json_decode($cached);
        if (is_object($publication)) {
            return $publication;
        }
    }

    $json = fetch_url($url);
    if ($json === null) {
        return null;
    }

    $publication = json_decode($json);
    if (!is_object($publication)) {
        return null;
    }

    file_put_contents($cachepath, $json);

    return $publication;
}

/**
 * Builds epub/lemonde.epub from a GetPublicationContentItems JSON URL.
 *
 * @param string $url           http(s) URL of the publication JSON.
 * @param string $date          Edition date formatted as Ymd.
 * @param bool   $includecover  Whether to add the front-page thumbnail as an image.
 * @param bool   $includeimages Whether to reference article images in the pages.
 */
function build_lemonde_epub(string $url, string $date, bool $includecover, bool $includeimages): void
{
    $publication = load_lemonde_publication($url);
    if ($publication === null || !is_array($publication->Content ?? null)) {
        echo 'error: could not load Le Monde publication data.<br>';
        return;
    }

    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/lemonde.epub';
    $epub->title = 'Le Monde ' . $date;

    if ($includecover) {
        // TODO: get correct cover according to date and time
        $coverurl = 'https://www.lemonde.fr/thumbnail/journal/' . $date . '/1000/1490';
        $epub->AddImage($coverurl, 'image/jpeg', true);
    }

    // Keep only the items of the 'Le Monde' category, ordered by page number.
    $articles = array_filter(
        $publication->Content,
        static fn ($item) => ($item->Category ?? null) === 'Le Monde'
    );
    usort(
        $articles,
        static fn ($a, $b) => ($a->PageNumber ?? 0) <=> ($b->PageNumber ?? 0)
    );

    foreach ($articles as $article) {
        $items = is_array($article->ContentItem ?? null) ? $article->ContentItem : [];

        $body = content_items_of_type($items, ['text/xml'])[0] ?? null;
        if ($body === null || empty($body->Title) || empty($body->HtmlText)) {
            continue;
        }

        $title = strip_tags($body->Title);
        $pagecontent = '<h1>' . $title . '</h1>';
        $pagecontent .= '<p>Page ' . ($article->PageNumber ?? '') . '</p>';

        $author = content_items_of_type($items, ['author/xml'])[0] ?? null;
        if ($author !== null && !empty($author->Author)) {
            $pagecontent .= $author->Author;
        }

        if (!empty($body->Introduction)) {
            $pagecontent .= '<b>' . $body->Introduction . '</b>';
        }

        if ($includeimages) {
            foreach (content_items_of_type($items, ['graphic/jpeg', 'image/jpeg']) as $image) {
                if (!isset($image->ContentItemId)) {
                    continue;
                }
                $imageid = (string) $image->ContentItemId;

                // Derive the image URL from the JSON URL. A callback is used so
                // that '$' or '\' in the id is never read as a backreference.
                $imageurl = preg_replace_callback(
                    '/GetPublicationContentItems-.*\.json/',
                    static fn (array $match): string => 'Image-MEDIUM-' . $imageid . '.jpg',
                    $url
                );
                if (!is_string($imageurl)) {
                    continue;
                }

                // Keep a local copy under temp/. The page still references the
                // remote URL; embedding the local copy through AddImage() is
                // not wired up yet.
                $imagedata = fetch_url($imageurl);
                if ($imagedata !== null) {
                    file_put_contents('temp/' . basename($imageid), $imagedata);
                }

                $pagecontent .= '<div><img src="' . $imageurl . '"></div>';
                if (!empty($image->HtmlText)) {
                    $pagecontent .= $image->HtmlText;
                }
            }
        }

        $pagecontent .= $body->HtmlText;
        $epub->AddPage($pagecontent, false, $title);
    }

    write_epub($epub);
}

/**
 * Builds an EPUB from an RSS feed, adding one page per feed item.
 *
 * The feed itself is downloaded without the cookie; each linked article is
 * downloaded with it.
 *
 * @param string   $feedurl        URL of the RSS feed.
 * @param string   $cookie         Value of the Cookie header sent with article requests.
 * @param string   $epubfile       Path of the EPUB file to create.
 * @param string   $epubtitle      Title of the EPUB.
 * @param string[] $contentqueries XPath queries tried in order; the first match is the article body.
 * @param string[] $stripqueries   XPath queries whose matching nodes are removed beforehand.
 */
function build_feed_epub(
    string $feedurl,
    string $cookie,
    string $epubfile,
    string $epubtitle,
    array $contentqueries,
    array $stripqueries = []
): void {
    $context = stream_context_create([
        'http' => [
            'method' => 'GET',
            // An array lets PHP join the header lines itself.
            'header' => [
                'Accept-language: en',
                'Cookie: ' . $cookie,
            ],
        ],
    ]);

    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = $epubfile;
    $epub->title = $epubtitle;

    // Load the feed.
    $feed = fetch_url($feedurl);
    if ($feed === null || trim($feed) === '') {
        echo 'error: could not download feed ' . escape_html($feedurl) . '<br>';
        return;
    }

    $previous = libxml_use_internal_errors(true);
    try {
        $xml = new SimpleXMLElement($feed);
    } catch (Exception $e) {
        $xml = null;
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if ($xml === null) {
        echo 'error: could not parse feed ' . escape_html($feedurl) . '<br>';
        return;
    }

    $feeditems = $xml->xpath('/rss/channel/item') ?: [];

    foreach ($feeditems as $item) {
        $title = (string) $item->title;
        $creators = $item->xpath('dc:creator') ?: [];
        $author = (string) ($creators[0] ?? '');
        $summary = (string) $item->description;
        $link = trim((string) $item->link);

        $article = $link !== '' ? fetch_url($link, $context) : null;
        $doc = $article !== null ? load_html_document($article) : null;
        if ($doc === null) {
            echo 'warning: could not get content of "' . escape_html($title) . '"<br>';
            continue;
        }

        $finder = new DOMXPath($doc);

        // Strip unwanted nodes (images, screen-reader-only text, ...).
        foreach ($stripqueries as $query) {
            $matches = $finder->query($query);
            if ($matches === false) {
                continue;
            }
            foreach ($matches as $elt) {
                $elt->parentNode?->removeChild($elt);
            }
        }

        // Title, author and date are plain text in the feed, so they are
        // escaped. The description is inserted unchanged, as before.
        $result = '<h1>' . escape_html($title) . '</h1>';
        $result .= '<p>' . escape_html($author) . '</p>';
        $result .= '<p>' . escape_html((string) $item->pubDate) . '</p>';
        $result .= '<p><b>' . $summary . '</b></p>';

        // Take the first node matched by the first query that matches anything.
        $node = null;
        foreach ($contentqueries as $query) {
            $nodes = $finder->query($query);
            if ($nodes !== false && $nodes->length > 0) {
                $node = $nodes->item(0);
                break;
            }
        }

        if ($node === null) {
            echo 'warning: could not get content of "' . escape_html($title) . '"<br>';
            continue;
        }

        $innerHTML = '';
        foreach ($node->childNodes as $childNode) {
            $innerHTML .= $doc->saveHTML($childNode);
        }

        $result .= '<div>' . strip_tags($innerHTML, '<p><b><h2><i>') . '</div>';
        $epub->AddPage($result, false, $title);
    }

    write_epub($epub);
}

date_default_timezone_set('Europe/Paris');
$date = (new DateTimeImmutable('today'))->format('Ymd');

// Le Monde
if (!empty($_POST['lemonde'])) {
    $lmurl = $_POST['lmurl'] ?? '';
    $lmurl = is_string($lmurl) ? trim($lmurl) : '';

    // Extract the url from a pasted curl command (first single-quoted argument).
    if (str_starts_with($lmurl, 'curl ')) {
        $lmurl = explode("'", $lmurl)[1] ?? '';
        echo '<p>extracted url from curl command:</p><p>' . escape_html($lmurl) . '</p>';
    }

    if (is_http_url($lmurl)) {
        build_lemonde_epub($lmurl, $date, !empty($lm_includecover), !empty($lm_includeimages));
    } else {
        echo 'error: the Le Monde url must be an http(s) url.<br>';
    }
}

// Mediapart
if (!empty($_POST['mediapart'])) {
    build_feed_epub(
        'https://www.mediapart.fr/articles/feed',
        'MPSESSID=' . ($mp_sessionid ?? ''),
        'epub/mediapart.epub',
        'Mediapart ' . $date,
        [
            '//div[contains(@class, "paywall-restricted-content")]',
            // free-access articles
            '//div[contains(@class, "news__body__center__article")]',
        ],
        [
            '//svg',
            '//figure',
            '//span[@class="screen-reader-only"]',
        ]
    );
}

// New York Times
if (!empty($_POST['nyt'])) {
    build_feed_epub(
        'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
        'NYT-S=' . ($nyt_sessionid ?? ''),
        'epub/newyorktimes.epub',
        'The New York Times ' . $date,
        ['//section[@name="articleBody"]']
    );
}

// List existing files.
echo '<div>';
foreach ((glob('epub/*') ?: []) as $file) {
    $name = basename($file);
    $mtime = filemtime($file);
    $modified = $mtime === false ? '' : date('F d Y H:i:s', $mtime);

    echo '<a href="epub/' . escape_html(rawurlencode($name)) . '">' . escape_html($name) . '</a> '
        . $modified . '<br>';
}
echo '</div>';
?>
<div>
Generate epub:<br>
<form method="post">
<input name="lemonde" type="checkbox">Le Monde. GetPublicationContentItems url: <input name="lmurl"><br>
<input name="mediapart" type="checkbox">Mediapart.<br>
<input name="nyt" type="checkbox">The New York Times.<br>
<input type="submit">
</form>
</div>
</body>
</html>