-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_crawler.py
More file actions
99 lines (88 loc) · 3.03 KB
/
web_crawler.py
File metadata and controls
99 lines (88 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
"""Web Crawler designed to find products and ratings for products developed and targeting seniors.
"""
import os
from urllib.request import urlopen
#from urllib import urlopen (python2)
from bs4 import BeautifulSoup
__author__ = "Disaiah Bennett"
__version__ = "0.1"
class WebCrawler:
    """Crawl a product web page and collect navigation categories and links.

    Attributes:
        url: str or None - the URL to crawl.
        page: the open ``urlopen`` response object (None until fetched).
        data: bytes or None - the raw page content read from ``page``.
        categories: list - navigation category names found on the page.
        catlinks: list - links for each navigation category.
        clean: bool - True once ``cleanup()`` has been run.
        count: int - category list item count.
    """

    def __init__(self, url=None, page=None, data=None, clean=False):
        """Initialize the crawler state.

        Args:
            url: str or None - the URL to crawl.
            page: optional pre-opened page object.
            data: optional pre-fetched page data.
            clean: bool - whether CSV files have already been moved.
        """
        self.url = url
        self.page = page
        self.data = data
        self.categories = []  # navigation category names
        self.catlinks = []    # navigation category links
        self.clean = clean
        self.count = 0        # category list item count

    def data_extract(self):
        """Fetch ``self.url`` and parse the response with BeautifulSoup.

        Returns:
            BeautifulSoup: the parsed document tree.
        """
        self.page = urlopen(self.url)
        # Close the response even if read() fails, so the socket is not leaked.
        try:
            self.data = self.page.read()
        finally:
            self.page.close()
        soup = BeautifulSoup(self.data, "html.parser")
        return soup

    def get_url(self):
        """Get the URL that the webcrawler will be accessing.

        Returns:
            str or None: the URL.

        Example:
            >>> example_url = crawler.get_url()
        """
        return self.url

    def get_page(self):
        """Get the page object the webcrawler fetched data from.

        Returns:
            the (closed) response object, or None if nothing was fetched.

        Example:
            >>> example_page = crawler.get_page()
        """
        return self.page

    def get_data(self):
        """Get the raw page data the webcrawler fetched.

        Returns:
            bytes or None: the page content.

        Example:
            >>> example_data = crawler.get_data()
        """
        return self.data

    def get_nav_categories(self):
        """Get the navigation categories parsed by the webcrawler.

        Returns:
            list: category names found in the navigation bar.

        Example:
            >>> example_categories = crawler.get_nav_categories()
        """
        return self.categories

    def get_nav_catlinks(self):
        """Get the navigation category links parsed by the webcrawler.

        Returns:
            list: category links found in the navigation bar.

        Example:
            >>> example_catlinks = crawler.get_nav_catlinks()
        """
        return self.catlinks

    def cleanup(self):
        """Move CSV files from the current directory into the csv folder.

        Runs ``move_csv.sh`` via the shell.  ``os.system`` does NOT raise
        ``OSError`` when the command fails - it returns the shell exit
        status - so the status is checked explicitly instead of relying
        on an exception handler that would never fire.

        Returns:
            bool: True once cleanup has been attempted.
        """
        self.clean = True
        status = os.system(". ./move_csv.sh")
        if status != 0:
            print("CLEANING FAILED")
        return self.clean