-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_crawler.py
More file actions
99 lines (88 loc) · 3.03 KB
/
web_crawler.py
File metadata and controls
99 lines (88 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
"""Web Crawler designed to find products and ratings for products developed and targeting seniors.
"""
import os
from urllib.request import urlopen
#from urllib import urlopen (python2)
from bs4 import BeautifulSoup
__author__ = "Disaiah Bennett"
__version__ = "0.1"
class WebCrawler:
    """Crawl a product web page and collect navigation categories and links.

    Attributes:
        url: str or None - the URL to crawl.
        page: the open ``urlopen`` response object (None until fetched).
        data: bytes or None - the raw page content read from ``page``.
        categories: list - navigation category names found on the page.
        catlinks: list - links for each navigation category.
        clean: bool - True once ``cleanup()`` has been run.
        count: int - category list item count.
    """

    def __init__(self, url=None, page=None, data=None, clean=False):
        """Initialize the crawler state.

        Args:
            url: str or None - the URL to crawl.
            page: optional pre-opened page object.
            data: optional pre-fetched page data.
            clean: bool - whether CSV files have already been moved.
        """
        self.url = url
        self.page = page
        self.data = data
        self.categories = []  # navigation category names
        self.catlinks = []    # navigation category links
        self.clean = clean
        self.count = 0        # category list item count

    def data_extract(self):
        """Fetch ``self.url`` and parse the response with BeautifulSoup.

        Returns:
            BeautifulSoup: the parsed document tree.
        """
        self.page = urlopen(self.url)
        # Close the response even if read() fails, so the socket is not leaked.
        try:
            self.data = self.page.read()
        finally:
            self.page.close()
        soup = BeautifulSoup(self.data, "html.parser")
        return soup

    def get_url(self):
        """Get the URL that the webcrawler will be accessing.

        Returns:
            str or None: the URL.

        Example:
            >>> example_url = crawler.get_url()
        """
        return self.url

    def get_page(self):
        """Get the page object the webcrawler fetched data from.

        Returns:
            the (closed) response object, or None if nothing was fetched.

        Example:
            >>> example_page = crawler.get_page()
        """
        return self.page

    def get_data(self):
        """Get the raw page data the webcrawler fetched.

        Returns:
            bytes or None: the page content.

        Example:
            >>> example_data = crawler.get_data()
        """
        return self.data

    def get_nav_categories(self):
        """Get the navigation categories parsed by the webcrawler.

        Returns:
            list: category names found in the navigation bar.

        Example:
            >>> example_categories = crawler.get_nav_categories()
        """
        return self.categories

    def get_nav_catlinks(self):
        """Get the navigation category links parsed by the webcrawler.

        Returns:
            list: category links found in the navigation bar.

        Example:
            >>> example_catlinks = crawler.get_nav_catlinks()
        """
        return self.catlinks

    def cleanup(self):
        """Move CSV files from the current directory into the csv folder.

        Runs ``move_csv.sh`` via the shell.  ``os.system`` does NOT raise
        ``OSError`` when the command fails - it returns the shell exit
        status - so the status is checked explicitly instead of relying
        on an exception handler that would never fire.

        Returns:
            bool: True once cleanup has been attempted.
        """
        self.clean = True
        status = os.system(". ./move_csv.sh")
        if status != 0:
            print("CLEANING FAILED")
        return self.clean