From a5b4b1e7356c3e5bcae624262ff3a7905fa569d8 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Thu, 23 May 2024 17:00:20 +0200 Subject: [PATCH] less scraping --- mediapart.php | 215 +++++++++++++++++++------------------------------- 1 file changed, 83 insertions(+), 132 deletions(-) diff --git a/mediapart.php b/mediapart.php index 19b9fe3..f45cc87 100644 --- a/mediapart.php +++ b/mediapart.php @@ -1,133 +1,84 @@ -textContent; - $content = str_replace('Partager sur Facebook', '', $content); - $content = str_replace('Partager sur X (ex Twitter)', '', $content); - - $content = str_replace('Lire + tard', '', $content); - $content = str_replace('Offrir l’article', '', $content); - $content = str_replace(' PDF', '', $content); - $content = str_replace('Cette fonctionnalité est actuellement en maintenance.La génération de PDF reste disponible via la fonctionnalité «Imprimer».', '', $content); - - $content = str_replace("Fermer\n", '', $content); - $content = str_replace("Recommander\n", '', $content); - $content = str_replace("Commenter\n", '', $content); - - - $content = str_replace(' ', '', $content); - $content = str_replace("\n", "\n\n", $content); - for ($i = 0; $i < 15; $i++) - { - $content = str_replace("\n\n\n", "\n\n", $content); - } - - return $content; -} - -$sessionid = 'value of cookie MPSESSID'; -$feedurl = 'https://www.mediapart.fr/articles/feed'; -$opts = [ - 'http' => [ - 'method' => "GET", - 'header' => "Accept-language: en\nCookie: MPSESSID=" . $sessionid, - ] -]; -$context = stream_context_create($opts); - -// load feeds -$feed = file_get_contents($feedurl); -$xml = new SimpleXMLElement($feed); -$items = $xml->xpath("/rss/channel/item"); - -setlocale(LC_ALL, 'fr_FR.UTF8', 'fr_FR','fr','fr','fra','fr_FR@euro'); -$localedate = strftime("%A %d %B %Y"); - -$header = "Mediapart - " . $localedate . "\n\nSommaire:\n"; -$result = ''; - -foreach ($items as $item) -{ - if (str_starts_with($item->pubDate, date("D, j M Y"))) - { - $title = $item->title; - - // Add to content - $header .= '- ' . $title . "\n"; - - $article = file_get_contents($item->link, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - $category = trim($finder->query('//p[@class="news__heading__top__kicker _default"]')->item(0)->textContent); - $summary = trim($finder->query('//p[@class="news__heading__top__intro margin-top:500"]')->item(0)->textContent); - $author = trim($finder->query('//a[@class="focus color:text-brand heading-product:100"]')->item(0)->textContent); - - // clean images - $figures = $finder->query('//figure'); - foreach ($figures as $figure) - { - $figure->parentNode->removeChild($figure); - } - - // clean "à lire aussi" and inline quotes - $asides = $finder->query('//aside'); - foreach ($asides as $aside) - { - $aside->parentNode->removeChild($aside); - } - - // improve section titles - $asides = $finder->query('//h2'); - foreach ($asides as $aside) - { - $aside->textContent = '*' . $aside->textContent . '*'; - } - - // Output - - // old: add header as one block - //$part1 = $finder->query('//div[@class="news__heading grid"]'); - //$result .= cleantext($part1->item(0)); - - $result .= "\n------\n\n"; - $result .= "Article : " . $item->title . "\n"; - $result .= 'Catégorie: ' . str_replace("\n", '', $category) . "\n"; - $result .= 'Auteur.ice : ' . $author . "\n"; - $result .= "\n"; - $result .= "Introduction :\n" . $summary . "\n"; - - // article main text - $maintext = $finder->query('//div[@class="news__body__center"]'); - $result .= cleantext($maintext->item(0)); - } - //break; -} - -// web version: - -/* -file_put_contents('mediapart.txt', $header . "\n" . $result); - -echo 'Mediapart - ' . $date . ''; -echo '
mediapart.txt
'; - -echo '
'; -echo str_replace("\n", '
', $header) . '
' . str_replace("\n", '
', $result); -echo '
'; -echo ''; -*/ - -$filename = 'mediapart' . str_replace(' ', '', $localedate) . '.txt'; -header('Content-Type: application/text; charset=utf-8'); -header('Content-Disposition: attachment; filename=' . $filename); - -echo $header . "\n" . $result; - + [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: MPSESSID=" . $sessionid, + ] +]; +$context = stream_context_create($opts); + +// load feeds +$feed = file_get_contents($feedurl); +$xml = new SimpleXMLElement($feed); +$items = $xml->xpath("/rss/channel/item"); + +setlocale(LC_ALL, 'fr_FR.UTF8', 'fr_FR','fr','fr','fra','fr_FR@euro'); +$localedate = strftime("%A %d %B %Y"); + +$header = "Mediapart - " . $localedate . "\n\nSommaire:\n"; +$result = ''; + +foreach ($items as $item) +{ + if (str_starts_with($item->pubDate, date("D, j M Y"))) + { + $title = $item->title; + //$category = $item->category; + $category = $item->xpath('dc:subject')[0]; + $author = $item->xpath('dc:creator')[0]; + $summary = $item->description; + + // Add to content + $header .= '- ' . $title . "\n"; + + $article = file_get_contents($item->link, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + + $finder = new DomXPath($doc); + + // clean images + $figures = $finder->query('//figure'); + foreach ($figures as $figure) + { + $figure->parentNode->removeChild($figure); + } + + // clean "à lire aussi" and inline quotes + $asides = $finder->query('//aside'); + foreach ($asides as $aside) + { + $aside->parentNode->removeChild($aside); + } + + // improve section titles + $asides = $finder->query('//h2'); + foreach ($asides as $aside) + { + $aside->textContent = "\n*" . $aside->textContent . "*\n"; + } + + // Output + $result .= "\n------\n\n"; + $result .= "Article : " . $item->title . "\n"; + $result .= 'Catégorie : ' . str_replace("\n", '', $category) . "\n"; + $result .= 'Auteur.ice : ' . $author . "\n"; + $result .= "Introduction : " . $summary . "\n"; + $result .= "\n"; + + // article main text + $maintext = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); + $result .= $maintext->item(0)->textContent; + } +} + +$filename = 'mediapart' . str_replace(' ', '', $localedate) . '.txt'; + +header('Content-Type: application/text; charset=utf-8'); +header('Content-Disposition: attachment; filename=' . $filename); +echo $header . "\n" . $result; + ?> \ No newline at end of file