add mediapart from home page
This commit is contained in:
		
							parent
							
								
									bb11d11087
								
							
						
					
					
						commit
						566287d767
					
				
							
								
								
									
										113
									
								
								epub.php
								
								
								
								
							
							
						
						
									
										113
									
								
								epub.php
								
								
								
								
							|  | @ -291,19 +291,19 @@ | ||||||
| 		write_epub($epub); | 		write_epub($epub); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
// Mediapart: refresh the stored session cookie when "Force cookies" is checked.
// Reads $mp_sessionid from the enclosing scope and replaces it with the value
// returned by get_mp_cookie(); the old value is then swapped for the new one
// inside settings.php. This runs before the Mediapart sources so that they use
// the refreshed cookie.
// NOTE(review): get_mp_cookie() is assumed to return the cookie value as a
// non-empty string on success - confirm against its definition.
if (!empty($_POST['forcempcookies']))
{
	$newsessionid = get_mp_cookie();

	if (!is_string($newsessionid) || $newsessionid === '')
	{
		// keep the current session id: overwriting it with an empty value would
		// also make later updates impossible (nothing left to search for)
		echo '<div>Could not get a new mpsessid, settings left unchanged.</div>';
	}
	else
	{
		$old = $mp_sessionid;
		$mp_sessionid = $newsessionid;

		$settings = file_get_contents('settings.php');
		$replaced = 0;
		if ($settings !== false && is_string($old) && $old !== '')
		{
			$settings = str_replace($old, $mp_sessionid, $settings, $replaced);
		}

		// only write the file back when the old id was really found and replaced,
		// so a failed read can never truncate settings.php
		if ($replaced > 0 && file_put_contents('settings.php', $settings) !== false)
		{
			echo '<div>Updated mpsessid in settings.</div>';
		}
		else
		{
			echo '<div>Got a new mpsessid but could not update settings.</div>';
		}
	}
}
|  | 
 | ||||||
|  | 	// Mediapart RSS
 | ||||||
| 	if (isset($_POST['mediapart']) && $_POST['mediapart']) | 	if (isset($_POST['mediapart']) && $_POST['mediapart']) | ||||||
| 	{ | 	{ | ||||||
| 		if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) |  | ||||||
| 		{ |  | ||||||
| 			$old = $mp_sessionid; |  | ||||||
| 			$mp_sessionid = get_mp_cookie(); |  | ||||||
| 			$settings = file_get_contents('settings.php'); |  | ||||||
| 			$settings = str_replace($old, $mp_sessionid, $settings); |  | ||||||
| 			file_put_contents('settings.php', $settings); |  | ||||||
| 			echo '<div>Updated mpsessid in settings.</div>'; |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		$feedurl = 'https://www.mediapart.fr/articles/feed'; | 		$feedurl = 'https://www.mediapart.fr/articles/feed'; | ||||||
| 		$opts = [ | 		$opts = [ | ||||||
| 		'http' => [ | 		'http' => [ | ||||||
|  | @ -386,6 +386,94 @@ | ||||||
| 		write_epub($epub); | 		write_epub($epub); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
// Mediapart home page: build epub/mediaparthome.epub from the articles linked
// in the "_hot" section(s) of the home page. Each linked article is downloaded
// with the MPSESSID cookie, reduced to title / author / date / summary / body
// text and added as one page of the EPUB.
// Reads $mp_sessionid and $date from the enclosing scope.
if (!empty($_POST['mphome']))
{
	$homeurl = 'https://www.mediapart.fr';
	$opts = [
	'http' => [
		'method' => "GET",
		// header lines are separated by CRLF, as in the PHP HTTP context examples
		'header' => "Accept-language: en\r\nCookie: MPSESSID=" . $mp_sessionid,
	]
	];
	$context = stream_context_create($opts);

	$epub = new TPEpubCreator();
	$epub->temp_folder = 'temp/';
	$epub->epub_file = 'epub/mediaparthome.epub';
	$epub->title = 'Mediapart home ' . $date;

	// collect libxml parse errors instead of printing one warning per
	// unrecognised tag; the previous setting is restored below
	$previouslibxml = libxml_use_internal_errors(true);

	// text of the first node matching an XPath query, or '' when nothing matches
	$firsttext = function ($xpath, $query)
	{
		$found = $xpath->query($query);
		if ($found === false || !$found->length)
		{
			return '';
		}
		return $found->item(0)->textContent;
	};

	// textContent is decoded text: escape it again before putting it in markup
	$esc = function ($text)
	{
		return htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
	};

	// article urls, used as keys so that an article linked several times
	// (picture + headline) is only fetched and added once
	$articleurls = [];

	$home = file_get_contents($homeurl, false, $context);
	if ($home === false || $home === '')
	{
		echo 'warning: could not get Mediapart home page<br>';
	}
	else
	{
		$homedoc = new DOMDocument();
		$homedoc->loadHTML($home);
		$homefinder = new DOMXPath($homedoc);

		$links = $homefinder->query('//section[contains(@class, "_hot")]//a[not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href');

		foreach ($links as $link)
		{
			$href = $link->value;

			// only site-relative links can be appended to the host name; anything
			// else would give an invalid url
			if (strpos($href, '/') !== 0)
			{
				continue;
			}

			$articleurls[$homeurl . $href] = true;
		}
	}

	foreach (array_keys($articleurls) as $url)
	{
		$article = file_get_contents($url, false, $context);
		if ($article === false || $article === '')
		{
			echo 'warning: could not download "' . $esc($url) . '"<br>';
			continue;
		}

		$articledoc = new DOMDocument();
		$articledoc->loadHTML($article);
		$articlefinder = new DOMXPath($articledoc);

		$title = $firsttext($articlefinder, '//h1');
		$author = $firsttext($articlefinder, '//a[contains(@href, "biographie")]');
		// deliberately not stored in $date, which holds the date used for titles
		$articledate = $firsttext($articlefinder, '//time');
		$summary = $firsttext($articlefinder, '//p[contains(@class, "news__heading__top__intro")]');

		// strip images; each query runs after the previous removals, in this order
		foreach (['//svg', '//figure', '//span[@class="screen-reader-only"]'] as $query)
		{
			foreach ($articlefinder->query($query) as $elt)
			{
				$elt->parentNode->removeChild($elt);
			}
		}

		$result = '<h1>' . $esc($title) . '</h1>';
		$result .= '<p>' . $esc($author) . '</p>';
		$result .= '<p>' . $esc($articledate) . '</p>';
		$result .= '<p><b>' . $esc($summary) . '</b></p>';

		$nodes = $articlefinder->query('//div[contains(@class, "paywall-restricted-content")]');
		if (!$nodes->length)
		{
			// free-access articles
			$nodes = $articlefinder->query('//div[contains(@class, "news__body__center__article")]');
		}

		if (!$nodes->length)
		{
			echo 'warning: could not get content of "' . $esc($title) . '"<br>';
			continue;
		}

		// serialise the children of the content node, keeping only basic markup
		$innerHTML = '';
		foreach ($nodes->item(0)->childNodes as $childNode)
		{
			$innerHTML .= $childNode->ownerDocument->saveHTML($childNode);
		}
		$result .= '<div>' . strip_tags($innerHTML, '<p><b><h2><i>') . '</div>';

		$epub->AddPage($result, false, $title);
	}

	libxml_clear_errors();
	libxml_use_internal_errors($previouslibxml);

	write_epub($epub);
}
|  | 
 | ||||||
| 	// New York Times
 | 	// New York Times
 | ||||||
| 	if (isset($_POST['nyt']) && $_POST['nyt']) | 	if (isset($_POST['nyt']) && $_POST['nyt']) | ||||||
| 	{ | 	{ | ||||||
|  | @ -565,6 +653,9 @@ | ||||||
| 	<input id="mediapart" name="mediapart" type="checkbox"> | 	<input id="mediapart" name="mediapart" type="checkbox"> | ||||||
| 	<label for="mediapart">From RSS</label> | 	<label for="mediapart">From RSS</label> | ||||||
| 	<br> | 	<br> | ||||||
|  | 	<input id="mphome" name="mphome" type="checkbox"> | ||||||
|  | 	<label for="mphome">From home page</label> | ||||||
|  | 	<br> | ||||||
| 	<input id="forcempcookies" name="forcempcookies" type="checkbox"> | 	<input id="forcempcookies" name="forcempcookies" type="checkbox"> | ||||||
| 	<label for="forcempcookies">Force cookies</label> | 	<label for="forcempcookies">Force cookies</label> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	 quenousimporte
						quenousimporte