diff --git a/Python/web_scraper/README.md b/Python/web_scraper/README.md
new file mode 100644
index 0000000..0525c2b
--- /dev/null
+++ b/Python/web_scraper/README.md
@@ -0,0 +1,121 @@
# Web Scraper

A Python command-line tool for scraping news articles from websites using the `newspaper3k` library. It can extract a single article or every article linked from a news site's homepage, and export the results to JSON or CSV.

## Features

- **Single Article Scraping**: Extract content from a specific article URL
- **Bulk Article Scraping**: Scrape all articles linked from a news website's homepage
- **Multiple Export Formats**: Export data as JSON or CSV
- **Custom File Names**: Specify a custom output file name
- **Article Metadata**: Extract title, authors, publication date, content, and URL

## Installation

1. Ensure you have Python 3.6+ installed
2. Install the required dependency:

```bash
pip install newspaper3k
```

## Usage

### Basic Single Article Scraping

```bash
python web_scraper.py "https://example.com/news-article"
```

This creates a `news.json` file with the scraped article data.

### Scrape All Articles from a News Site

```bash
python web_scraper.py "https://example-news.com" --all-articles
```

### Export to CSV Format

```bash
python web_scraper.py "https://example.com/article" --csv-format
```

### Custom Output File Name

```bash
python web_scraper.py "https://example.com/article" --file my_articles
```

### Combine Options

```bash
# Scrape all articles and export as CSV with a custom filename
python web_scraper.py "https://example-news.com" -a -csv -f my_data
```

## Command Line Arguments

| Argument | Short | Description | Default |
|----------|-------|-------------|---------|
| `url` | - | URL of the webpage to scrape (required) | - |
| `--file` | `-f` | Custom output filename | `news` |
| `--csv-format` | `-csv` | Export to CSV instead of JSON | `False` |
| `--all-articles` | `-a` | Scrape all articles from the site | `False` |

## Output Format

### JSON Output
```json
[
  {
    "title": "Article Title",
    "authors": ["Author One", "Author Two"],
    "publish_date": "2023-10-15 14:30:00",
    "text": "Full article content...",
    "url": "https://example.com/article"
  }
]
```

### CSV Output
The CSV file contains the columns:
- `title`
- `authors` (stored as the string representation of the list; see the sketch below for reading it back)
- `publish_date`
- `text`
- `url`
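Because the `authors` column holds a Python list's string representation rather than plain text, downstream code has to convert it back. A minimal sketch using only the standard library, assuming the default `news.csv` output name:

```python
import ast
import csv

# Read the exported CSV back and recover each row's authors list.
with open('news.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        # The authors field looks like "['Author One', 'Author Two']",
        # so ast.literal_eval safely turns it back into a real list.
        authors = ast.literal_eval(row['authors']) if row['authors'] else []
        print(f"{row['title']} by {', '.join(authors)}")
```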
## Examples

1. **Scrape a single article to JSON:**
   ```bash
   python web_scraper.py "https://www.bbc.com/news/world-us-canada-12345678"
   ```

2. **Scrape all articles from CNN and export as CSV:**
   ```bash
   python web_scraper.py "https://www.cnn.com" -a -csv -f cnn_articles
   ```

3. **Scrape with a custom JSON filename:**
   ```bash
   python web_scraper.py "https://example.com/article" -f my_article_data
   ```

## Notes

- The tool relies on the `newspaper3k` library, which may not work with all websites, especially those with heavy JavaScript rendering or anti-scraping measures
- Some news sites may block automated scraping attempts
- The quality of the extracted content depends on the website's structure and on `newspaper3k`'s parsing capabilities
- For sites with many articles, `--all-articles` may take considerable time, since every linked article is downloaded and parsed individually

## Error Handling

- If scraping fails, the tool displays an error message
- Empty results are reported with an appropriate message
- Network issues and parsing errors are caught and reported
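## Programmatic Use

The scraper can also be driven from Python rather than the command line. A minimal sketch using the `WebScraper` class as implemented in `web_scraper.py` (names below follow this version of the code, and the script is assumed to be importable from the current directory):

```python
from web_scraper import WebScraper

# Scrape a single article and export it as JSON; the matching
# .json extension is appended to file_name automatically.
scraper = WebScraper('https://example.com/news-article',
                     file_name='my_article', export_format='json')

article = scraper.get_one_article()    # returns a dict, or None on failure
if article:
    scraper.export_to_JSON([article])  # both exporters expect a list of dicts
```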
## License

This tool is provided for educational and personal use. Please respect website terms of service and robots.txt files when scraping.
\ No newline at end of file
diff --git a/Python/web_scraper/web_scraper.py b/Python/web_scraper/web_scraper.py
new file mode 100644
index 0000000..44edd4d
--- /dev/null
+++ b/Python/web_scraper/web_scraper.py
@@ -0,0 +1,115 @@
#!/usr/bin/env python3

import argparse
import csv
import json

import newspaper


class WebScraper:
    def __init__(self, url, file_name='news', export_format='json'):
        self.url = url

        if export_format not in ('json', 'csv'):
            raise ValueError('Export format must be either json or csv.')

        self.export_format = export_format

        # Append the matching extension unless the caller already supplied it.
        suffix = '.' + export_format
        self.file_name = file_name if file_name.endswith(suffix) else file_name + suffix

    def export_to_JSON(self, articles):
        with open(self.file_name, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)

    def export_to_CSV(self, articles):
        with open(self.file_name, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'authors', 'publish_date', 'text', 'url'])
            writer.writeheader()
            for article in articles:
                # Note: the authors list is written as its string representation.
                writer.writerow(article)

    def get_one_article(self, url=None):
        """Download and parse a single article; return a summary dict or None."""
        target_url = url or self.url
        try:
            article = newspaper.Article(target_url)
            article.download()
            article.parse()
            return {
                'title': article.title or 'No title found',
                'authors': article.authors or ['Unknown author'],
                'publish_date': article.publish_date.strftime('%Y-%m-%d %H:%M:%S') if article.publish_date else None,
                'text': article.text or 'No content found',
                'url': target_url,
            }
        except Exception as e:
            print(f'Error scraping {target_url}: {e}')
            return None

    def get_all_articles(self):
        """Build a newspaper source from the URL and scrape every linked article."""
        try:
            summaries = []
            paper = newspaper.build(self.url, memoize_articles=False)
            for art in paper.articles:
                summary = self.get_one_article(art.url)
                if summary:
                    summaries.append(summary)
            return summaries
        except Exception as e:
            print(f'Error building newspaper from {self.url}: {e}')
            return []


def main():
    parser = argparse.ArgumentParser(description='Web Scraper for News')
    parser.add_argument('url', help='URL of the webpage to scrape')
    parser.add_argument('--file', '-f', default='news',
                        help='Custom output file (default: news.json or news.csv)')
    parser.add_argument('--csv-format', '-csv', action='store_true',
                        help='Export to CSV format instead of JSON format')
    parser.add_argument('--all-articles', '-a', action='store_true',
                        help='Scrape all articles linked from the URL instead of only the article at the URL itself')

    args = parser.parse_args()

    export_format = 'csv' if args.csv_format else 'json'

    try:
        web_scraper = WebScraper(
            url=args.url,
            file_name=args.file,
            export_format=export_format,
        )

        if args.all_articles:
            articles = web_scraper.get_all_articles()
        else:
            single_article = web_scraper.get_one_article()
            articles = [single_article] if single_article else []

        if articles:
            if export_format == 'json':
                web_scraper.export_to_JSON(articles)
            else:
                web_scraper.export_to_CSV(articles)
            print(f'Successfully exported {len(articles)} article(s) to {web_scraper.file_name}')
        else:
            print('No articles found to export.')

    except Exception as e:
        print(f'Error: {e}')


if __name__ == '__main__':
    main()
\ No newline at end of file