python-headline-scraper/scraper.py at main · HuntedCode/python-headline-scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from bs4 import BeautifulSoup
import requests


def fetch_headlines(url="https://news.ycombinator.com/", limit=10, timeout=10):
    """Fetch headlines and their metadata from a Hacker News-style page.

    Args:
        url: Page to scrape. Defaults to the Hacker News front page.
        limit: Maximum number of title/subline spans to look up. Defaults to 10.
        timeout: Seconds to wait for the server before giving up. Defaults to 10.

    Returns:
        A list of dicts with the keys 'title', 'link', 'score', 'author' and
        'date'. 'score' is the first whitespace-separated token of the score
        text and 'date' is the first token of the age span's ``title``
        attribute. An empty list is returned (after printing a message) when
        the request fails, the status code is not 200, or any headline cannot
        be parsed.
    """
    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(
            url,
            headers={"User-Agent": "PythonPracticeScraper/1.0"},
            timeout=timeout,
        )
    except requests.RequestException:
        print("Webpage didn't return correctly!")
        return []

    if response.status_code != 200:
        print("Webpage didn't return correctly!")
        # Return an empty list (not None) so every failure path has the same
        # shape and callers can always iterate the result.
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    title_spans = soup.find_all("span", class_="titleline", limit=limit)
    subline_spans = soup.find_all("span", class_="subline", limit=limit)

    # NOTE(review): titles and sublines are paired purely by position; if the
    # page contains an entry without a "subline" span, later pairs would be
    # shifted -- confirm against the live markup.
    headlines = []
    try:
        for title_span, subline_span in zip(title_spans, subline_spans):
            title_tag = title_span.find("a")
            headlines.append({
                'title': title_tag.text,
                'link': title_tag['href'],
                'score': subline_span.find("span", class_="score").text.split()[0],
                'author': subline_span.find("a", class_="hnuser").text,
                'date': subline_span.find("span", class_="age")['title'].split()[0],
            })
    except (AttributeError, KeyError, TypeError, IndexError):
        # A missing tag makes find() return None (AttributeError on .text,
        # TypeError on subscripting), a missing attribute raises KeyError and
        # empty text raises IndexError. Anything else is a real bug and should
        # propagate instead of being swallowed.
        print("There was an error processing headlines. Please try again later.")
        return []

    return headlines