2024-06-04 09:42:41 +02:00
<!DOCTYPE html>
<html>
<head>
    <!-- Page head for the epub generator; the PHP section that follows handles the posted form. -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0">
    <title>Revue de presse - Epub</title>
    <link rel="icon" type="image/png" href="1F4F0_color.png" />
</head>
<body>
<?php
2024-05-28 16:13:52 +02:00
2024-06-03 17:03:19 +02:00
require 'settings.php' ;
2024-06-03 17:29:30 +02:00
require 'TPEpubCreator.php' ;
2024-06-03 17:03:19 +02:00
2024-08-28 15:57:56 +02:00
/**
 * Serialise the children of a DOM node to simplified HTML for an epub page.
 *
 * <aside> and <blockquote> elements are rewritten as bold-italic runs, then every tag
 * other than <p>, <b>, <h2>, <i> and <strong> is stripped. The result is wrapped in a <div>.
 *
 * @param DOMNode|null $node Node whose children are serialised; null yields an empty <div>.
 * @return string Simplified HTML fragment.
 */
function get_clean_html($node)
{
    // todo check words with "-" (like "sous-traitance")
    $innerHTML = '';
    if ($node !== null)
    {
        foreach ($node->childNodes as $childNode)
        {
            $nodeHTML = $childNode->ownerDocument->saveHTML($childNode);
            // Only the tag-name prefix is replaced, so attributes and the closing '>' are kept
            // and end up on the <i> element.
            $nodeHTML = str_replace(
                ['<aside', '</aside', '<blockquote', '</blockquote'],
                ['<b><i', '</i></b', '<b><i', '</i></b'],
                $nodeHTML
            );
            $innerHTML .= $nodeHTML;
        }
    }
    return '<div>' . strip_tags($innerHTML, '<p><b><h2><i><strong>') . '</div>';
}
2024-08-26 18:10:14 +02:00
/**
 * Download one Mediapart article and append it to the epub as a page.
 *
 * When the "mptoday" form option is set, articles whose URL does not contain today's
 * date (formatted "dmy") are skipped. A warning is echoed, and no page is added, when
 * the article cannot be downloaded or its body cannot be located.
 *
 * @param string|Stringable $url     Article URL.
 * @param resource          $context Stream context used for the download.
 * @param object            $epub    Epub builder; only its AddPage() method is used here.
 * @return void
 */
function add_mp_article($url, $context, $epub)
{
    $url = (string) $url;
    if (!empty($_POST['mptoday']) && !str_contains($url, (new DateTimeImmutable('today'))->format('dmy')))
    {
        return;
    }

    $article = file_get_contents($url, false, $context);
    if ($article === false || $article === '')
    {
        // DOMDocument::loadHTML() throws a ValueError on an empty string.
        echo 'warning: could not download "' . htmlspecialchars($url) . '"<br>';
        return;
    }

    // libxml reports every HTML5 tag it does not know; keep those reports out of the page.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($article);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $finder = new DomXPath($doc);
    // Text of the $index-th node matching $query, or '' when the page has no such node.
    $text = static function ($query, $index = 0) use ($finder)
    {
        $match = $finder->query($query)->item($index);
        return $match === null ? '' : $match->textContent;
    };

    $title = $text('//h1');
    $author = $text('//a[contains(@href, "biographie")]');
    // the second <time> element is the one used for the article date
    $date = trim($text('//time', 1));
    $summary = $text('//p[contains(@class, "news__heading__top__intro")]');

    // strip images and screen-reader-only labels
    $toremove = $finder->query('//svg | //figure | //span[@class="screen-reader-only"]');
    foreach ($toremove as $elt)
    {
        if ($elt->parentNode !== null)
        {
            $elt->parentNode->removeChild($elt);
        }
    }

    // textContent is decoded text, so it is re-escaped before going back into markup.
    $result = '<h1>' . htmlspecialchars($title) . '</h1>';
    $result .= '<p>' . htmlspecialchars($author) . '</p>';
    $result .= '<p>' . htmlspecialchars($date) . '</p>';
    $result .= '<p><b>' . htmlspecialchars($summary) . '</b></p>';

    $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
    if (!$nodes->length)
    {
        // freely accessible articles use a different container
        $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]');
    }
    // todo add "boite noire"
    if (!$nodes->length)
    {
        echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
        return;
    }

    $result .= get_clean_html($nodes->item(0));
    $epub->AddPage($result, false, $title);
}
2024-07-07 15:01:30 +02:00
/**
 * Log in to Mediapart with the configured credentials and return the session cookie value.
 *
 * Reads the globals $mp_user and $mp_password, posts them to the login endpoint and scans
 * the response headers (redirects included) for a "MPSESSID" cookie.
 *
 * @return string Value of the last MPSESSID cookie seen, or '' when the request fails or
 *                no such cookie is returned.
 */
function get_mp_cookie()
{
    global $mp_user;
    global $mp_password;
    $url = 'https://www.mediapart.fr/login_check';
    // http_build_query() URL-encodes the credentials; a password containing '&', '+' or '%'
    // would otherwise corrupt the form body.
    $postfields = http_build_query(
        [
            'email' => (string) $mp_user,
            'password' => (string) $mp_password,
            'submitButton' => '',
            '_target_path' => 'https://www.mediapart.fr/',
        ],
        '',
        '&'
    );
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, false);
    // headers are included in the returned string so that Set-Cookie lines can be parsed
    curl_setopt($ch, CURLOPT_HEADER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $postfields);
    $result = curl_exec($ch);
    if ($result === false)
    {
        return '';
    }
    $sessid = '';
    // [^;\r\n] keeps a cookie that has no attributes from swallowing the following header lines
    preg_match_all('/^Set-Cookie:\s*([^;\r\n]*)/mi', $result, $matches);
    foreach ($matches[1] as $item)
    {
        if (str_starts_with($item, 'MPSESSID='))
        {
            $sessid = substr($item, strlen('MPSESSID='));
        }
    }
    return $sessid;
}
2024-06-04 09:42:41 +02:00
/**
 * Write the epub to disk and report the outcome on the page.
 *
 * Any previous file at $epub->epub_file is deleted first. If the builder already carries
 * an error, or CreateEPUB() sets one, that error is echoed; otherwise a link to the new
 * file is echoed.
 *
 * @param object $epub Epub builder exposing epub_file, error and CreateEPUB().
 * @return void
 */
function write_epub($epub)
{
    $target = $epub->epub_file;
    if (file_exists($target))
    {
        unlink($target);
    }

    if (!$epub->error)
    {
        $epub->CreateEPUB();
        if (!$epub->error)
        {
            echo 'Success: <a href="' . $epub->epub_file . '">' . $epub->epub_file . '</a> created.<br>';
            return;
        }
    }

    // either the builder was already in error, or creation just failed
    echo $epub->error;
}
2024-06-13 09:42:57 +02:00
/**
 * Download a Le Monde page image into the local temp folder.
 *
 * The image URL is derived from $base_url by replacing its
 * "GetPublicationContentItems-*.json" part with "<url_part><id>.jpg".
 *
 * @param string $base_url URL of the "GetPublicationContentItems" JSON document.
 * @param string $url_part Prefix placed before the id in the image file name.
 * @param mixed  $id       Page identifier, used in the remote and local file names.
 * @return string Local path the image was written to.
 */
function lm_download_image($base_url, $url_part, $id)
{
    $file_name = $id . '.jpg';
    $local_path = './temp/' . $file_name;
    $remote = preg_replace('/GetPublicationContentItems-.*\.json/', $url_part . $file_name, $base_url);
    file_put_contents($local_path, file_get_contents($remote));
    return $local_path;
}
2024-07-09 16:42:49 +02:00
/**
 * Fetch and decode a JSON document, caching the raw body under temp/.
 *
 * The cache file name is the md5 of the URL. Responses are cached because the source
 * URLs may expire. Failed or empty downloads are never cached, so a later call can retry
 * instead of being served an empty file forever.
 *
 * @param string $url Location of the JSON document.
 * @return mixed Decoded JSON (objects as stdClass), or null when the document could not
 *               be fetched or decoded.
 */
function get_json($url)
{
    $tempjsonpath = 'temp/' . hash('md5', $url) . '.json';
    if (is_file($tempjsonpath) && filesize($tempjsonpath) > 0)
    {
        $json = file_get_contents($tempjsonpath);
    }
    else
    {
        $json = file_get_contents($url);
        if ($json !== false && $json !== '')
        {
            file_put_contents($tempjsonpath, $json);
        }
    }
    if ($json === false || $json === '')
    {
        return null;
    }
    return json_decode($json);
}
2024-06-04 09:21:33 +02:00
date_default_timezone_set('Europe/Paris');
// date stamp (Ymd) appended to every generated epub title
$today = (new DateTimeImmutable('today'))->format('Ymd');

// "Empty cache" option: delete every regular file in temp/ before generating anything
if (!empty($_POST['emptycache']))
{
    // glob() returns false on error; treat that as "nothing to delete"
    $files = glob('temp/*') ?: [];
    foreach ($files as $file)
    {
        if (is_file($file))
        {
            unlink($file);
        }
    }
}
2024-07-08 11:47:56 +02:00
// Cafeyn: build an epub from the "material" JSON document of a publication.
// Each article becomes a page; page thumbnails are interleaved up to the article's page
// number, and the first thumbnail is used as the cover.
if (!empty($_POST['cafeyn']))
{
    $url = $_POST['cafeyn'];
    // extract url from curl command: take the first single-quoted argument
    if (str_starts_with($url, 'curl '))
    {
        $parts = explode("'", $url);
        if (isset($parts[1]))
        {
            $url = $parts[1];
        }
        echo '<p>extracted url from curl command:</p><p>' . htmlspecialchars($url) . '</p>';
    }
    // get_json caches the json in case the url expires
    $publication = get_json($url);
    if (!isset($publication->metadata->title, $publication->articles))
    {
        echo '<p>error: could not load Cafeyn publication</p>';
    }
    else
    {
        $pubname = $publication->metadata->title;

        $epub = new TPEpubCreator();
        $epub->temp_folder = 'temp/';
        // the title comes from remote JSON: keep path separators and reserved characters
        // out of the file name
        $epub->epub_file = 'epub/' . preg_replace('/[\/\\\\:*?"<>|]/', '_', $pubname) . 'cafeyn.epub';
        $epub->title = $pubname . ' Cafeyn - ' . $today;

        $page = 1;
        foreach ((array) $publication->articles as $entry)
        {
            $article = get_json(str_replace('material', 'article/' . $entry->hash . '.json', $url));
            if (!is_object($article))
            {
                echo 'warning: could not load article ' . htmlspecialchars((string) $entry->hash) . '<br>';
                continue;
            }

            $content = '<h1>' . $article->title . '</h1>';
            $content .= '<div>Page ' . $article->page . '</div>';
            if (!empty($article->rubrics))
            {
                $content .= '<div>Rubriques: ' . implode(',', $article->rubrics) . '</div>';
            }
            if (!empty($article->authors))
            {
                $content .= '<div>Auteurs: ' . implode(',', $article->authors) . '</div>';
            }

            foreach ($article->content->sections ?? [] as $section)
            {
                // a "dropcap" item holds the first letter(s) of the paragraph that follows it
                $dropcap = '';
                foreach ($section->items ?? [] as $item)
                {
                    if (($item->type ?? null) !== 'text')
                    {
                        continue;
                    }
                    $class = $item->class ?? '';
                    if ($class === 'quote')
                    {
                        $content .= '<blockquote>' . $item->content . '</blockquote>';
                    }
                    elseif ($class === 'paragraphTitle')
                    {
                        $content .= '<h2>' . $item->content . '</h2>';
                    }
                    elseif ($class === 'introduction')
                    {
                        $content .= '<p><b>' . $item->content . '</b></p>';
                    }
                    elseif ($class === 'dropcap')
                    {
                        $dropcap = $item->content;
                    }
                    else
                    {
                        $paragraph = $item->content;
                        if ($dropcap)
                        {
                            $paragraph = '<b>' . $dropcap . '</b>' . $paragraph;
                            $dropcap = '';
                        }
                        $content .= '<p>' . $paragraph . '</p>';
                    }
                }
            }

            // emit the thumbnails of every page up to and including this article's page
            while ($article->page >= $page)
            {
                if (isset($publication->pages[$page - 1]->tn))
                {
                    $path = str_replace('material', $publication->pages[$page - 1]->tn, $url);
                    $path = str_replace('api.', 'content.', $path);
                    if ($page == 1)
                    {
                        // cover
                        $epub->AddImage($path, 'image/jpeg', true);
                    }
                    else
                    {
                        $epub->AddPage('<img style="width: 100%" src="' . $path . '">', false, 'Page ' . $page, true);
                    }
                }
                $page++;
            }
            $epub->AddPage($content, false, $article->title);
        }
        write_epub($epub);
    }
}
2024-07-07 15:12:56 +02:00
2024-06-03 17:29:30 +02:00
// Le Monde: build an epub from the "GetPublicationContentItems" JSON document.
// Articles of category "Le Monde" are added in page order; the preview image of each
// page is inserted before the first article on that page, and page 1 is the cover.
if (!empty($_POST['lemonde']))
{
    $url = $_POST['lemonde'];
    // extract url from curl command: take the first single-quoted argument
    if (str_starts_with($url, 'curl '))
    {
        $parts = explode("'", $url);
        if (isset($parts[1]))
        {
            $url = $parts[1];
        }
        echo '<p>extracted url from curl command:</p><p>' . htmlspecialchars($url) . '</p>';
    }
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/lemonde.epub';
    $epub->title = 'Le Monde ' . $today;

    $publication = get_json($url);
    $pageindex = get_json(str_replace('GetPublicationContentItems', 'GetPublicationPages', $url));
    if (!isset($publication->Content, $pageindex->Page))
    {
        echo '<p>error: could not load Le Monde publication</p>';
    }
    else
    {
        $content = array_filter($publication->Content, static fn ($item) => $item->Category == 'Le Monde');
        usort($content, static fn ($a, $b) => $a->PageNumber <=> $b->PageNumber);

        $page = 0;
        foreach ($content as $article)
        {
            if ($article->PageNumber > $page)
            {
                $page = $article->PageNumber;
                // first page-index entry for this page number, if any
                $pageobj = null;
                foreach ($pageindex->Page as $candidate)
                {
                    if ($candidate->PageNumber == $page)
                    {
                        $pageobj = $candidate;
                        break;
                    }
                }
                if ($pageobj !== null)
                {
                    $path = lm_download_image($url, 'Preview-MEDIUM-', $pageobj->PublicationPageID);
                    if ($page == 1)
                    {
                        // cover
                        $epub->AddImage($path, 'image/jpeg', true);
                    }
                    else
                    {
                        $epub->AddPage('<img style="width: 100%" src="' . $path . '">', false, 'Page ' . $page, true);
                    }
                }
            }

            // first "text/xml" item is the article body, first "author/xml" item the byline
            $articlebody = null;
            $author = null;
            foreach ($article->ContentItem as $contentitem)
            {
                if ($articlebody === null && $contentitem->ContentType == 'text/xml')
                {
                    $articlebody = $contentitem;
                }
                elseif ($author === null && $contentitem->ContentType == 'author/xml')
                {
                    $author = $contentitem;
                }
            }

            if ($articlebody !== null && !empty($articlebody->Title) && !empty($articlebody->HtmlText))
            {
                $pagecontent = '<h1>' . strip_tags($articlebody->Title) . '</h1>';
                $pagecontent .= '<p>Page ' . $article->PageNumber . '</p>';
                if ($author !== null && !empty($author->Author))
                {
                    $pagecontent .= $author->Author;
                }
                if (!empty($articlebody->Introduction))
                {
                    $pagecontent .= '<b>' . $articlebody->Introduction . '</b>';
                }
                $pagecontent .= $articlebody->HtmlText;

                // NOTE(review): the original replaced a character that is indistinguishable from
                // a plain space in this listing; non-breaking spaces are assumed - confirm.
                $pagetitle = str_replace(["\u{00A0}", "\u{202F}"], ' ', strip_tags($articlebody->Title));
                $epub->AddPage($pagecontent, false, $pagetitle, true);
            }
        }
        write_epub($epub);
    }
}
2024-05-28 16:13:52 +02:00
2024-08-22 12:08:39 +02:00
// "Force cookies" option: log in again and store the fresh session id in settings.php
// by replacing the old value textually.
if (!empty($_POST['forcempcookies']))
{
    $old = $mp_sessionid ?? '';
    $new = get_mp_cookie();
    if ($new === '')
    {
        // a failed login must not wipe the stored session id
        echo '<div>Could not get a new mpsessid; settings left unchanged.</div>';
    }
    elseif ($old === '')
    {
        // nothing to search for in settings.php: use the new id for this run only
        $mp_sessionid = $new;
        echo '<div>No previous mpsessid found in settings; settings left unchanged.</div>';
    }
    else
    {
        $mp_sessionid = $new;
        $settings = file_get_contents('settings.php');
        if ($settings === false)
        {
            echo '<div>Could not read settings.php; settings left unchanged.</div>';
        }
        else
        {
            file_put_contents('settings.php', str_replace($old, $new, $settings));
            echo '<div>Updated mpsessid in settings.</div>';
        }
    }
}
2024-07-07 15:01:30 +02:00
2024-08-22 12:08:39 +02:00
// Mediapart RSS: one epub page per item of the public articles feed.
// The articles themselves are downloaded with the session cookie.
if (!empty($_POST['mediapart']))
{
    $feedurl = 'https://www.mediapart.fr/articles/feed';
    $opts = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'Accept-language: en',
                'Cookie: MPSESSID=' . $mp_sessionid,
            ],
        ],
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/mediapart.epub';
    $epub->title = 'Mediapart - RSS ' . $today;

    // load feeds
    $items = [];
    $feed = file_get_contents($feedurl);
    if ($feed === false || $feed === '')
    {
        echo 'warning: could not download the Mediapart feed<br>';
    }
    else
    {
        try
        {
            $xml = new SimpleXMLElement($feed);
            $items = $xml->xpath('/rss/channel/item') ?: [];
        }
        catch (Exception $e)
        {
            echo 'warning: could not parse the Mediapart feed<br>';
        }
    }
    foreach ($items as $item)
    {
        add_mp_article((string) $item->link, $context, $epub);
    }
    write_epub($epub);
}
2024-09-04 17:43:51 +02:00
// Politis RSS: collect the feed articles that belong to the requested weekly issue.
// An article is kept when its HTML mentions "dans l'hebdo N° <issue>".
if (!empty($_POST['politis']))
{
    $issuenb = trim((string) $_POST['politis']);
    $feedurl = 'https://www.politis.fr/flux-rss-politis-fr/';
    $opts = [
        'http' => [
            'method' => 'GET',
            // todo? send a session cookie as done for Mediapart
        ],
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    // the issue number comes from the form: keep path characters out of the file name
    $epub->epub_file = 'epub/politis' . preg_replace('/[^A-Za-z0-9_-]/', '', $issuenb) . '.epub';
    $epub->title = 'Politis ' . $issuenb . ' - RSS ' . $today;

    // Whitespace-tolerant issue marker; the trailing (?!\d) stops issue "18" from matching "1825".
    // NOTE(review): exact spacing and apostrophe of the site's marker could not be confirmed
    // from this listing, hence the tolerant pattern - verify against a live article.
    $issuepattern = '/dans(?:\s|&nbsp;)+l(?:’|\'|&rsquo;|&#8217;)(?:\s|&nbsp;)*hebdo(?:\s|&nbsp;)+N°(?:\s|&nbsp;)*'
        . preg_quote($issuenb, '/') . '(?!\d)/u';

    // load feeds
    $items = [];
    $feed = file_get_contents($feedurl);
    if ($feed === false || $feed === '')
    {
        echo 'warning: could not download the Politis feed<br>';
    }
    else
    {
        try
        {
            $xml = new SimpleXMLElement($feed);
            $items = $xml->xpath('/rss/channel/item') ?: [];
        }
        catch (Exception $e)
        {
            echo 'warning: could not parse the Politis feed<br>';
        }
    }

    // the feed is iterated in reverse order
    foreach (array_reverse($items) as $item)
    {
        $url = (string) $item->link;
        $article = file_get_contents($url, false, $context);
        if ($article === false || $article === '' || preg_match($issuepattern, $article) !== 1)
        {
            continue;
        }

        // libxml reports every HTML5 tag it does not know; keep those reports out of the page.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($article);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $finder = new DomXPath($doc);
        // Text of the first node matching $query, or '' when the page has no such node.
        $text = static function ($query) use ($finder)
        {
            $match = $finder->query($query)->item(0);
            return $match === null ? '' : $match->textContent;
        };
        $title = $text('//h1');
        $author = $text('//span[@class="nom"]');
        $date = $text('//span[@class="date"]');
        $summary = $text('//div[@class="extrait"]');

        // textContent is decoded text, so it is re-escaped before going back into markup.
        $result = '<h1>' . htmlspecialchars($title) . '</h1>';
        $result .= '<p>' . htmlspecialchars($author) . '</p>';
        $result .= '<p>' . htmlspecialchars($date) . '</p>';
        $result .= '<p><b>' . htmlspecialchars($summary) . '</b></p>';

        $nodes = $finder->query('//div[@id="content"]');
        if (!$nodes->length)
        {
            echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
            continue;
        }
        $result .= get_clean_html($nodes->item(0));
        $epub->AddPage($result, false, $title);
    }
    write_epub($epub);
}
2024-08-22 12:08:39 +02:00
// Mediapart home page
// Mediapart home page: one epub page per article teaser linked from the front page.
if (!empty($_POST['mphome']))
{
    $homeurl = 'https://www.mediapart.fr';
    $opts = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'Accept-language: en',
                'Cookie: MPSESSID=' . $mp_sessionid,
            ],
        ],
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/mediaparthome.epub';
    $epub->title = 'Mediapart - A la une ' . $today;

    $home = file_get_contents($homeurl, false, $context);
    if ($home === false || $home === '')
    {
        // DOMDocument::loadHTML() throws a ValueError on an empty string.
        echo 'warning: could not download the Mediapart home page<br>';
    }
    else
    {
        // libxml reports every HTML5 tag it does not know; keep those reports out of the page.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($home);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $finder = new DomXPath($doc);
        // teaser links under /journal/, excluding dossiers, studio pages, "fil-" pages
        // and the series index
        $xpath = '//a[@data-js = "teaser-link" and contains(@href, "/journal/") and not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href';

        $links = $finder->query($xpath);
        foreach ($links as $link)
        {
            add_mp_article('https://www.mediapart.fr' . $link->value, $context, $epub);
        }
    }
    write_epub($epub);
}
2024-09-05 14:07:27 +02:00
// New York Times RSS
2024-06-03 21:28:52 +02:00
// New York Times RSS: one epub page per item of the home page feed.
// Title, author, date and summary come from the feed; the body is scraped from the article page.
if (!empty($_POST['nyt']))
{
    $feedurl = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml';
    $opts = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'Accept-language: en',
                'Cookie: NYT-S=' . $nyt_sessionid,
            ],
        ],
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/newyorktimes.epub';
    $epub->title = 'The New York Times ' . $today;

    // load feeds
    $items = [];
    $feed = file_get_contents($feedurl);
    if ($feed === false || $feed === '')
    {
        echo 'warning: could not download the New York Times feed<br>';
    }
    else
    {
        try
        {
            $xml = new SimpleXMLElement($feed);
            $items = $xml->xpath('/rss/channel/item') ?: [];
        }
        catch (Exception $e)
        {
            echo 'warning: could not parse the New York Times feed<br>';
        }
    }

    foreach ($items as $item)
    {
        $title = (string) $item->title;
        // not every item carries a dc:creator element
        $author = (string) (($item->xpath('dc:creator') ?: [])[0] ?? '');
        $summary = (string) $item->description;

        $article = file_get_contents((string) $item->link, false, $context);
        if ($article === false || $article === '')
        {
            // DOMDocument::loadHTML() throws a ValueError on an empty string.
            echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
            continue;
        }

        // libxml reports every HTML5 tag it does not know; keep those reports out of the page.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($article);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        $finder = new DomXPath($doc);

        // feed values are decoded text, so they are re-escaped before going into markup
        $result = '<h1>' . htmlspecialchars($title) . '</h1>';
        $result .= '<p>' . htmlspecialchars($author) . '</p>';
        $result .= '<p>' . htmlspecialchars((string) $item->pubDate) . '</p>';
        $result .= '<p><b>' . htmlspecialchars($summary) . '</b></p>';

        $nodes = $finder->query('//section[@name="articleBody"]');
        if (!$nodes->length)
        {
            echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
        }
        else
        {
            $result .= get_clean_html($nodes->item(0));
            $epub->AddPage($result, false, $title);
        }
    }
    write_epub($epub);
}
2024-06-05 12:14:44 +02:00
// New York Times today's paper
2024-06-04 17:19:23 +02:00
// New York Times today's paper: articles are listed in the JSON state object that the
// "todayspaper" page assigns to window.__preloadedData in an embedded script.
if (!empty($_POST['nyttoday']))
{
    $todaypage = file_get_contents('https://www.nytimes.com/section/todayspaper');
    $opts = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'Accept-language: en',
                'Cookie: NYT-S=' . $nyt_sessionid,
            ],
        ],
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/newyorktimestoday.epub';
    $epub->title = 'The New York Times today\'s paper ' . $today;

    // grab today's paper: the object literal between the marker and the first "};" after it
    $data = null;
    $marker = 'window.__preloadedData = ';
    $start = is_string($todaypage) ? strpos($todaypage, $marker) : false;
    $end = $start === false ? false : strpos($todaypage, '};', $start);
    if ($start !== false && $end !== false)
    {
        $start += strlen($marker);
        // "undefined" is valid JavaScript but not valid JSON; $end + 1 keeps the closing brace
        $json = str_replace(':undefined', ':null', substr($todaypage, $start, $end + 1 - $start));
        $data = json_decode($json);
    }

    if (!isset($data->initialState))
    {
        echo 'warning: could not read today\'s paper data<br>';
    }
    else
    {
        $state = $data->initialState;
        foreach ($state as $item)
        {
            // the state object mixes entity types; only Article entries are pages
            if (!is_object($item) || ($item->__typename ?? null) !== 'Article')
            {
                continue;
            }
            // headline, bylines and creators are stored as references (ids) into the same state object
            $headlineid = $item->headline->id ?? null;
            $title = $headlineid === null ? '' : (string) ($state->{$headlineid}->default ?? '');
            $summary = (string) ($item->summary ?? '');
            $result = '<h1>' . htmlspecialchars($title) . '</h1>';

            // authors
            $result .= '<p>';
            $bylineid = $item->bylines[0]->id ?? null;
            $creators = $bylineid === null ? [] : ($state->{$bylineid}->creators ?? []);
            foreach ($creators as $creator)
            {
                $author = $state->{$creator->id} ?? null;
                $result .= htmlspecialchars((string) ($author->displayName ?? '')) . ' ';
            }
            $result .= '</p>';

            $result .= '<p>' . htmlspecialchars((string) ($item->lastMajorModification ?? '')) . '</p>';
            $result .= '<p><b>' . htmlspecialchars($summary) . '</b></p>';

            $article = isset($item->url) ? file_get_contents($item->url, false, $context) : false;
            if ($article === false || $article === '')
            {
                // DOMDocument::loadHTML() throws a ValueError on an empty string.
                echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
                continue;
            }

            // libxml reports every HTML5 tag it does not know; keep those reports out of the page.
            $previous = libxml_use_internal_errors(true);
            $doc = new DOMDocument();
            $doc->loadHTML($article);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            $finder = new DomXPath($doc);
            $nodes = $finder->query('//section[@name="articleBody"]');
            if (!$nodes->length)
            {
                echo 'warning: could not get content of "' . htmlspecialchars($title) . '"<br>';
            }
            else
            {
                $result .= get_clean_html($nodes->item(0));
                $epub->AddPage($result, false, $title);
            }
        }
    }
    write_epub($epub);
}
?>
2024-08-27 11:13:26 +02:00
<!-- Generation form: every field is read from $_POST by the PHP section above. -->
<h1>Generate epub</h1>
<div>
<form method="post">

<input type="submit">
<h2>Mediapart</h2>
<input id="mediapart" name="mediapart" type="checkbox">
<label for="mediapart">From RSS</label>
<br>
<input id="mphome" name="mphome" type="checkbox">
<label for="mphome">From home page</label>
<br>
<input id="forcempcookies" name="forcempcookies" type="checkbox">
<label for="forcempcookies">Force cookies</label>
<br>
<input id="mptoday" name="mptoday" type="checkbox">
<label for="mptoday">Today only</label>

<h2>Le Monde</h2>
<label for="lemonde">"GetPublicationContentItems" json url:</label>
<input id="lemonde" name="lemonde">
<h2>Cafeyn</h2>
<label for="cafeyn">"material" json url:</label>
<input id="cafeyn" name="cafeyn">

<h2>The New York Times</h2>
<input id="nyt" name="nyt" type="checkbox">
<label for="nyt">From RSS</label>
<br>
<input id="nyttoday" name="nyttoday" type="checkbox">
<label for="nyttoday">From today's paper embedded script</label>

<h2>Politis</h2>
<label for="politis">Issue number:</label><input id="politis" name="politis">

<h2>Admin</h2>
<input id="emptycache" name="emptycache" type="checkbox" checked>
<label for="emptycache">Empty cache</label>
<br><br>

<input type="submit">
</form>
</div>

<h1>Links</h1>
<a href="../epub">Fichiers générés</a><br>
<a href="../">Revue de presse</a><br>

<h1>Mediapart RSS summary</h1>
2024-08-26 11:01:20 +02:00
<?php
// List the current Mediapart feed items as links (publication date and title).
// The feed is fetched without a stream context: a context left over from an earlier
// section could carry another site's cookie, and none exists on a plain page load.
$mprss = file_get_contents('https://www.mediapart.fr/articles/feed');
$items = [];
if ($mprss !== false && $mprss !== '')
{
    try
    {
        $xml = new SimpleXMLElement($mprss);
        $items = $xml->xpath('/rss/channel/item') ?: [];
    }
    catch (Exception $e)
    {
        echo '<div>warning: could not parse the Mediapart feed</div>';
    }
}
else
{
    echo '<div>warning: could not download the Mediapart feed</div>';
}
foreach ($items as $item)
{
    // feed values are decoded text, so they are escaped before going into markup
    echo '<div><a href="' . htmlspecialchars((string) $item->link) . '">'
        . htmlspecialchars((string) $item->pubDate) . ' ' . htmlspecialchars((string) $item->title)
        . '</a></div>';
}
?>
</body>
</html>