From a2aa42424df298b3cd6d9e831ee150c0c0d696bb Mon Sep 17 00:00:00 2001
From: Joachim Blaafjell Holwech
Date: Wed, 21 Oct 2020 11:21:07 +0200
Subject: [PATCH 1/4] Update README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 0a50054..27ccabc 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,10 @@ recent articles from any news-site. Just add the websites you want to scrape to
 `NewsPapers.json` and the script will go through and scrape each site listed in
 the file.
 
+This repository was originally created as part of [this tutorial](https://holwech.github.io/blog/Automatic-news-scraper/).
+
+Thanks to [Pål Grønås Drange](https://github.com/pgdr) for his contributions to the repository.
+
 ## Installing
 
 You need to download the content of this repository, then run

From 0404b554273afcccf3dbe65c02a1c74168cf6773 Mon Sep 17 00:00:00 2001
From: Fawaz Shah
Date: Thu, 22 Apr 2021 16:29:18 +0100
Subject: [PATCH 2/4] Update newsscraper script

---
 newsscraper.py | 91 +++++++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 41 deletions(-)

diff --git a/newsscraper.py b/newsscraper.py
index e82187a..533ebb7 100644
--- a/newsscraper.py
+++ b/newsscraper.py
@@ -12,8 +12,7 @@
 from newspaper import Article
 
 
-data = {}
-data["newspapers"] = {}
+data = {"newspapers": {}}
 
 
 def parse_config(fname):
@@ -28,7 +27,7 @@
     return cfg
 
 
-def _handle_rss(company, value, count, limit):
+def _handle_rss(company, value, limit):
     """If a RSS link is provided in the JSON file, this will be the
     first choice.
 
@@ -39,6 +38,7 @@
     attr empty in the JSON file.
 
     """
+    count = 1
    fpd = fp.parse(value["rss"])
     print(f"Downloading articles from {company}")
     news_paper = {"rss": value["rss"], "link": value["link"], "articles": []}
@@ -50,8 +50,7 @@
             continue
         if count > limit:
             break
-        article = {}
-        article["link"] = entry.link
+        article = {"link": entry.link}
         date = entry.published_parsed
         article["published"] = datetime.fromtimestamp(mktime(date)).isoformat()
         try:
@@ -68,11 +67,11 @@
         article["text"] = content.text
         news_paper["articles"].append(article)
         print(f"{count} articles downloaded from {company}, url: {entry.link}")
-        count = count + 1
-    return count, news_paper
+        count += 1
+    return news_paper
 
 
-def _handle_fallback(company, value, count, limit):
+def _handle_fallback(company, url, limit):
     """This is the fallback method if a RSS-feed link is not provided.
 
     It uses the python newspaper library to extract articles.
@@ -80,33 +79,41 @@ def _handle_fallback(company, url, limit):
     """
     print(f"Building site for {company}")
-    paper = newspaper.build(value["link"], memoize_articles=False)
-    news_paper = {"link": value["link"], "articles": []}
-    none_type_count = 0
+    try:
+        paper = newspaper.build(url, memoize_articles=False)
+    except:
+        print("Error building newspaper, aborting...")
+        return
+
+    news_paper = {"link": url, "articles": []}
+    print(f"{len(paper.articles)} articles found")
+
+    num_articles_downloaded = 0
+    error_count = 0
+
     for content in paper.articles:
-        if count > limit:
+        if num_articles_downloaded >= limit:
+            break
+        # After 10 articles with errors from the same newspaper, the company will be skipped.
+        if error_count > 10:
+            print("Too many errors for this source, aborting...")
             break
+
         try:
             content.download()
             content.parse()
         except Exception as err:
+            error_count += 1
             print(err)
             print("continuing...")
             continue
-        # Again, for consistency, if there is no found publish date the
-        # article will be skipped.
-        #
-        # After 10 downloaded articles from the same newspaper without
-        # publish date, the company will be skipped.
-        if content.publish_date is None:
-            print(f"{count} Article has date of type None...")
-            none_type_count = none_type_count + 1
-            if none_type_count > 10:
-                print("Too many noneType dates, aborting...")
-                none_type_count = 0
-                break
-            count = count + 1
+
+        # For consistency, if there is no found publish date the article will be skipped.
+        if content.publish_date is None or content.publish_date == '':
+            print(f"Can't find article publish date, skipping...")
+            error_count += 1
             continue
+
         article = {
             "title": content.title,
             "text": content.text,
@@ -114,35 +121,36 @@
             "published": content.publish_date.isoformat(),
         }
         news_paper["articles"].append(article)
+        num_articles_downloaded += 1
         print(
-            f"{count} articles downloaded from {company} using newspaper, url: {content.url}"
+            f"{num_articles_downloaded} articles downloaded from {company} using newspaper, url: {content.url}"
         )
-        count = count + 1
-        none_type_count = 0
-    return count, news_paper
+
+    return news_paper
 
 
-def run(config, limit=4):
+def run(config, limit):
     """Take a config object of sites and urls, and an upper limit.
 
     Iterate through each news company. Write result to scraped_articles.json.
     """
-    for company, value in config.items():
-        count = 1
+    for i, (company, value) in enumerate(config.items()):
+        print(f"NEWS SITE {i+1} OUT OF {len(config)}")
         if "rss" in value:
-            count, news_paper = _handle_rss(company, value, count, limit)
+            news_paper = _handle_rss(company, value, limit)
         else:
-            count, news_paper = _handle_fallback(company, value, count, limit)
+            url = value["link"]
+            news_paper = _handle_fallback(company, url, limit)
         data["newspapers"][company] = news_paper
 
-    # Finally it saves the articles as a JSON-file.
-    try:
-        with open("scraped_articles.json", "w") as outfile:
-            json.dump(data, outfile, indent=2)
-    except Exception as err:
-        print(err)
+        # Save collected data to file at each iteration in case of error
+        try:
+            with open("scraped_articles.json", "w") as outfile:
+                json.dump(data, outfile, indent=2)
+        except Exception as err:
+            print(err)
 
 
 def main():
@@ -156,7 +164,8 @@
     if len(args) < 2:
         sys.exit("Usage: newsscraper.py NewsPapers.json")
 
-    limit = 4
+    limit = 10
+
     if "--limit" in args:
         idx = args.index("--limit")
         limit = int(args[idx + 1])

From 7a7289f1d2ef103cc1f0377442ec9dda45866f7f Mon Sep 17 00:00:00 2001
From: Fawaz Shah
Date: Thu, 22 Apr 2021 16:42:21 +0100
Subject: [PATCH 3/4] Change count to num_articles_downloaded, begins from 0

---
 newsscraper.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/newsscraper.py b/newsscraper.py
index 533ebb7..7de6471 100644
--- a/newsscraper.py
+++ b/newsscraper.py
@@ -38,7 +38,7 @@ def _handle_rss(company, value, limit):
     attr empty in the JSON file.
 
     """
-    count = 1
+    num_articles_downloaded = 0
     fpd = fp.parse(value["rss"])
     print(f"Downloading articles from {company}")
     news_paper = {"rss": value["rss"], "link": value["link"], "articles": []}
@@ -48,7 +48,7 @@
         # keep the script from crashing.
         if not hasattr(entry, "published"):
             continue
-        if count > limit:
+        if num_articles_downloaded > limit:
             break
         article = {"link": entry.link}
         date = entry.published_parsed
@@ -66,8 +66,8 @@
         article["title"] = content.title
         article["text"] = content.text
         news_paper["articles"].append(article)
-        print(f"{count} articles downloaded from {company}, url: {entry.link}")
-        count += 1
+        num_articles_downloaded += 1
+        print(f"{num_articles_downloaded} articles downloaded from {company}, url: {entry.link}")
     return news_paper
 
 

From 027782c5c3371b9ecb07ba08aad730033038b426 Mon Sep 17 00:00:00 2001
From: Fawaz Shah
Date: Thu, 22 Apr 2021 16:59:57 +0100
Subject: [PATCH 4/4] Change 'company' to 'newspaper'

---
 newsscraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/newsscraper.py b/newsscraper.py
index 7de6471..0402682 100644
--- a/newsscraper.py
+++ b/newsscraper.py
@@ -94,7 +94,7 @@ def _handle_fallback(company, url, limit):
     for content in paper.articles:
         if num_articles_downloaded >= limit:
             break
-        # After 10 articles with errors from the same newspaper, the company will be skipped.
+        # After 10 articles with errors from the same newspaper, the newspaper will be skipped.
         if error_count > 10:
             print("Too many errors for this source, aborting...")
             break
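A note on the configuration these patches assume: run() iterates over the config parsed from NewsPapers.json, where each key names a newspaper and each value carries a "link" field plus an optional "rss" field; entries with "rss" are handled by _handle_rss() and the rest fall back to _handle_fallback(), which builds the site with newspaper.build(). A minimal config in that shape could look like the sketch below; the entry names and URLs are placeholders, not values taken from the repository.

    {
        "example-with-rss": {
            "rss": "https://example.com/feed.xml",
            "link": "https://example.com/"
        },
        "example-without-rss": {
            "link": "https://example.org/"
        }
    }

With the argument handling from patch 2, a run with the per-site article limit set to 20 would be started as "python newsscraper.py NewsPapers.json --limit 20"; if --limit is omitted, the default is now 10 articles per site instead of the previous 4.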