From a2aa42424df298b3cd6d9e831ee150c0c0d696bb Mon Sep 17 00:00:00 2001
From: Joachim Blaafjell Holwech
Date: Wed, 21 Oct 2020 11:21:07 +0200
Subject: [PATCH 1/4] Update README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 0a50054..27ccabc 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,10 @@ recent articles from any news-site. Just add the websites you want to scrape to
 `NewsPapers.json` and the script will go through and scrape each site listed in
 the file.
 
+This repository was originally created as part of [this tutorial](https://holwech.github.io/blog/Automatic-news-scraper/).
+
+Thanks to [Pål Grønås Drange](https://github.com/pgdr) for his contributions to the repository.
+
 ## Installing
 
 You need to download the content of this repository, then run

From 0404b554273afcccf3dbe65c02a1c74168cf6773 Mon Sep 17 00:00:00 2001
From: Fawaz Shah
Date: Thu, 22 Apr 2021 16:29:18 +0100
Subject: [PATCH 2/4] Update newsscraper script

---
 newsscraper.py | 91 +++++++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 41 deletions(-)

diff --git a/newsscraper.py b/newsscraper.py
index e82187a..533ebb7 100644
--- a/newsscraper.py
+++ b/newsscraper.py
@@ -12,8 +12,7 @@
 from newspaper import Article
 
 
-data = {}
-data["newspapers"] = {}
+data = {"newspapers": {}}
 
 
 def parse_config(fname):
@@ -28,7 +27,7 @@
     return cfg
 
 
-def _handle_rss(company, value, count, limit):
+def _handle_rss(company, value, limit):
     """If a RSS link is provided in the JSON file, this will be the
     first choice.
 
@@ -39,6 +38,7 @@
     attr empty in the JSON file.
 
     """
+    count = 1
    fpd = fp.parse(value["rss"])
     print(f"Downloading articles from {company}")
     news_paper = {"rss": value["rss"], "link": value["link"], "articles": []}
@@ -50,8 +50,7 @@
             continue
         if count > limit:
             break
-        article = {}
-        article["link"] = entry.link
+        article = {"link": entry.link}
         date = entry.published_parsed
         article["published"] = datetime.fromtimestamp(mktime(date)).isoformat()
         try:
@@ -68,11 +67,11 @@
         article["text"] = content.text
         news_paper["articles"].append(article)
         print(f"{count} articles downloaded from {company}, url: {entry.link}")
-        count = count + 1
-    return count, news_paper
+        count += 1
+    return news_paper
 
 
-def _handle_fallback(company, value, count, limit):
+def _handle_fallback(company, url, limit):
     """This is the fallback method if a RSS-feed link is not provided.
 
     It uses the python newspaper library to extract articles.
@@ -80,33 +79,41 @@ def _handle_fallback(company, url, limit):
     """
     print(f"Building site for {company}")
-    paper = newspaper.build(value["link"], memoize_articles=False)
-    news_paper = {"link": value["link"], "articles": []}
-    none_type_count = 0
+    try:
+        paper = newspaper.build(url, memoize_articles=False)
+    except:
+        print("Error building newspaper, aborting...")
+        return
+
+    news_paper = {"link": url, "articles": []}
+    print(f"{len(paper.articles)} articles found")
+
+    num_articles_downloaded = 0
+    error_count = 0
+
     for content in paper.articles:
-        if count > limit:
+        if num_articles_downloaded >= limit:
+            break
+        # After 10 articles with errors from the same newspaper, the company will be skipped.
+        if error_count > 10:
+            print("Too many errors for this source, aborting...")
             break
+
         try:
             content.download()
             content.parse()
         except Exception as err:
+            error_count += 1
             print(err)
             print("continuing...")
             continue
-        # Again, for consistency, if there is no found publish date the
-        # article will be skipped.
-        #
-        # After 10 downloaded articles from the same newspaper without
-        # publish date, the company will be skipped.
-        if content.publish_date is None:
-            print(f"{count} Article has date of type None...")
-            none_type_count = none_type_count + 1
-            if none_type_count > 10:
-                print("Too many noneType dates, aborting...")
-                none_type_count = 0
-                break
-            count = count + 1
+
+        # For consistency, if there is no found publish date the article will be skipped.
+        if content.publish_date is None or content.publish_date == '':
+            print(f"Can't find article publish date, skipping...")
+            error_count += 1
             continue
+
         article = {
             "title": content.title,
             "text": content.text,
@@ -114,35 +121,36 @@
             "published": content.publish_date.isoformat(),
         }
         news_paper["articles"].append(article)
+        num_articles_downloaded += 1
         print(
-            f"{count} articles downloaded from {company} using newspaper, url: {content.url}"
+            f"{num_articles_downloaded} articles downloaded from {company} using newspaper, url: {content.url}"
         )
-        count = count + 1
-        none_type_count = 0
-    return count, news_paper
+
+    return news_paper
 
 
-def run(config, limit=4):
+def run(config, limit):
     """Take a config object of sites and urls, and an upper limit.
 
     Iterate through each news company. Write result to scraped_articles.json.
     """
-    for company, value in config.items():
-        count = 1
+    for i, (company, value) in enumerate(config.items()):
+        print(f"NEWS SITE {i+1} OUT OF {len(config)}")
         if "rss" in value:
-            count, news_paper = _handle_rss(company, value, count, limit)
+            news_paper = _handle_rss(company, value, limit)
         else:
-            count, news_paper = _handle_fallback(company, value, count, limit)
+            url = value["link"]
+            news_paper = _handle_fallback(company, url, limit)
         data["newspapers"][company] = news_paper
 
-    # Finally it saves the articles as a JSON-file.
-    try:
-        with open("scraped_articles.json", "w") as outfile:
-            json.dump(data, outfile, indent=2)
-    except Exception as err:
-        print(err)
+        # Save collected data to file at each iteration in case of error
+        try:
+            with open("scraped_articles.json", "w") as outfile:
+                json.dump(data, outfile, indent=2)
+        except Exception as err:
+            print(err)
 
 
 def main():
@@ -156,7 +164,8 @@
     if len(args) < 2:
         sys.exit("Usage: newsscraper.py NewsPapers.json")
 
-    limit = 4
+    limit = 10
+
     if "--limit" in args:
         idx = args.index("--limit")
         limit = int(args[idx + 1])

From 7a7289f1d2ef103cc1f0377442ec9dda45866f7f Mon Sep 17 00:00:00 2001
From: Fawaz Shah
Date: Thu, 22 Apr 2021 16:42:21 +0100
Subject: [PATCH 3/4] Change count to num_articles_downloaded, begins from 0

---
 newsscraper.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/newsscraper.py b/newsscraper.py
index 533ebb7..7de6471 100644
--- a/newsscraper.py
+++ b/newsscraper.py
@@ -38,7 +38,7 @@ def _handle_rss(company, value, limit):
     attr empty in the JSON file.
 
     """
-    count = 1
+    num_articles_downloaded = 0
     fpd = fp.parse(value["rss"])
     print(f"Downloading articles from {company}")
     news_paper = {"rss": value["rss"], "link": value["link"], "articles": []}
@@ -48,7 +48,7 @@
         # keep the script from crashing.
         if not hasattr(entry, "published"):
             continue
-        if count > limit:
+        if num_articles_downloaded > limit:
             break
         article = {"link": entry.link}
         date = entry.published_parsed
@@ -66,8 +66,8 @@
         article["title"] = content.title
         article["text"] = content.text
         news_paper["articles"].append(article)
-        print(f"{count} articles downloaded from {company}, url: {entry.link}")
-        count += 1
+        num_articles_downloaded += 1
+        print(f"{num_articles_downloaded} articles downloaded from {company}, url: {entry.link}")
     return news_paper
 
 

From 027782c5c3371b9ecb07ba08aad730033038b426 Mon Sep 17 00:00:00 2001
From: Fawaz Shah
Date: Thu, 22 Apr 2021 16:59:57 +0100
Subject: [PATCH 4/4] Change 'company' to 'newspaper'

---
 newsscraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/newsscraper.py b/newsscraper.py
index 7de6471..0402682 100644
--- a/newsscraper.py
+++ b/newsscraper.py
@@ -94,7 +94,7 @@ def _handle_fallback(company, url, limit):
     for content in paper.articles:
         if num_articles_downloaded >= limit:
             break
-        # After 10 articles with errors from the same newspaper, the company will be skipped.
+        # After 10 articles with errors from the same newspaper, the newspaper will be skipped.
         if error_count > 10:
             print("Too many errors for this source, aborting...")
             break
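A note on the configuration these patches assume: run() iterates over the config parsed from NewsPapers.json, where each key names a newspaper and each value carries a "link" field plus an optional "rss" field; entries with "rss" are handled by _handle_rss() and the rest fall back to _handle_fallback(), which builds the site with newspaper.build(). A minimal config in that shape could look like the sketch below; the entry names and URLs are placeholders, not values taken from the repository.

    {
        "example-with-rss": {
            "rss": "https://example.com/feed.xml",
            "link": "https://example.com/"
        },
        "example-without-rss": {
            "link": "https://example.org/"
        }
    }

With the argument handling from patch 2, a run with the per-site article limit set to 20 would be started as "python newsscraper.py NewsPapers.json --limit 20"; if --limit is omitted, the default is now 10 articles per site instead of the previous 4.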