add mediapart from home page
This commit is contained in:
		
							parent
							
								
									bb11d11087
								
							
						
					
					
						commit
						566287d767
					
				
							
								
								
									
										97
									
								
								epub.php
								
								
								
								
							
							
						
						
									
										97
									
								
								epub.php
								
								
								
								
							|  | @ -291,9 +291,6 @@ | |||
| 		write_epub($epub); | ||||
| 	} | ||||
| 
 | ||||
| 	// Mediapart
 | ||||
| 	if (isset($_POST['mediapart']) && $_POST['mediapart']) | ||||
| 	{ | ||||
| 	if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) | ||||
| 	{ | ||||
| 		$old = $mp_sessionid; | ||||
|  | @ -304,6 +301,9 @@ | |||
| 		echo '<div>Updated mpsessid in settings.</div>'; | ||||
| 	} | ||||
| 
 | ||||
| 	// Mediapart RSS
 | ||||
| 	if (isset($_POST['mediapart']) && $_POST['mediapart']) | ||||
| 	{ | ||||
| 		$feedurl = 'https://www.mediapart.fr/articles/feed'; | ||||
| 		$opts = [ | ||||
| 		'http' => [ | ||||
|  | @ -386,6 +386,94 @@ | |||
| 		write_epub($epub); | ||||
| 	} | ||||
| 
 | ||||
| 	// Mediapart home page
 | ||||
| 	if (isset($_POST['mphome']) && $_POST['mphome']) | ||||
| 	{ | ||||
| 		$homeurl = 'https://www.mediapart.fr'; | ||||
| 		$opts = [ | ||||
| 		'http' => [ | ||||
| 			'method' => "GET", | ||||
| 			'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, | ||||
| 		] | ||||
| 		]; | ||||
| 		$context = stream_context_create($opts); | ||||
| 
 | ||||
| 		$epub = new TPEpubCreator(); | ||||
| 		$epub->temp_folder = 'temp/'; | ||||
| 		$epub->epub_file = 'epub/mediaparthome.epub'; | ||||
| 		$epub->title = 'Mediapart home ' . $date; | ||||
| 
 | ||||
| 		$home = file_get_contents($homeurl, false, $context); | ||||
| 		$doc = new DOMDocument(); | ||||
| 		$doc->loadHTML($home); | ||||
| 		$finder = new DomXPath($doc); | ||||
| 
 | ||||
| 		$links = $finder->query('//section[contains(@class, "_hot")]//a[not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'); | ||||
| 
 | ||||
| 		foreach ($links as $link) | ||||
| 		{ | ||||
| 			$url = 'https://www.mediapart.fr' . $link->value; | ||||
| 
 | ||||
| 			$article = file_get_contents($url, false, $context); | ||||
| 			$doc = new DOMDocument(); | ||||
| 			$doc->loadHTML($article); | ||||
| 			$finder = new DomXPath($doc); | ||||
| 
 | ||||
| 			$title = $finder->query('//h1')->item(0)->textContent; | ||||
| 			$author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; | ||||
| 			$date = $finder->query('//time')->item(0)->textContent; | ||||
| 			$summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; | ||||
| 
 | ||||
| 			// strip images
 | ||||
| 			$toremove = $finder->query('//svg'); | ||||
| 			foreach ($toremove as $elt) | ||||
| 			{ | ||||
| 				$elt->parentNode->removeChild($elt); | ||||
| 			} | ||||
| 			$toremove = $finder->query('//figure'); | ||||
| 			foreach ($toremove as $elt) | ||||
| 			{ | ||||
| 				$elt->parentNode->removeChild($elt); | ||||
| 			} | ||||
| 			$toremove = $finder->query('//span[@class="screen-reader-only"]'); | ||||
| 			foreach ($toremove as $elt) | ||||
| 			{ | ||||
| 				$elt->parentNode->removeChild($elt); | ||||
| 			} | ||||
| 
 | ||||
| 			$result = '<h1>' . $title . '</h1>'; | ||||
| 			$result .= '<p>' . $author . '</p>'; | ||||
| 			$result .= '<p>' . $date . '</p>'; | ||||
| 			$result .= '<p><b>' . $summary . '</b></p>'; | ||||
| 
 | ||||
| 			$nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); | ||||
| 			if (!$nodes->length) | ||||
| 			{ | ||||
| 				// articles accès libre
 | ||||
| 				$nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); | ||||
| 			} | ||||
| 
 | ||||
| 			if (!$nodes->length) | ||||
| 			{ | ||||
| 				echo 'warning: could not get content of "' . $title . '"<br>'; | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				$node = $nodes->item(0); | ||||
| 
 | ||||
| 				$innerHTML = ''; | ||||
| 				foreach ($node->childNodes as $childNode){ | ||||
| 					$innerHTML .= $childNode->ownerDocument->saveHTML($childNode); | ||||
| 				} | ||||
| 				$result .= '<div>' . strip_tags($innerHTML, '<p><b><h2><i>') . '</div>'; | ||||
| 
 | ||||
| 				$epub->AddPage($result, false, $title); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		write_epub($epub); | ||||
| 	} | ||||
| 
 | ||||
| 	// New York Times
 | ||||
| 	if (isset($_POST['nyt']) && $_POST['nyt']) | ||||
| 	{ | ||||
|  | @ -565,6 +653,9 @@ | |||
| 	<input id="mediapart" name="mediapart" type="checkbox"> | ||||
| 	<label for="mediapart">From RSS</label> | ||||
| 	<br> | ||||
| 	<input id="mphome" name="mphome" type="checkbox"> | ||||
| 	<label for="mphome">From home page</label> | ||||
| 	<br> | ||||
| 	<input id="forcempcookies" name="forcempcookies" type="checkbox"> | ||||
| 	<label for="forcempcookies">Force cookies</label> | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 quenousimporte
						quenousimporte