-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextminingcacheexample.py
More file actions
126 lines (101 loc) · 4.15 KB
/
textminingcacheexample.py
File metadata and controls
126 lines (101 loc) · 4.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Example class for downloading and storing text files.
Uses a local file cache to avoid repeatedly downloading the same file.
>>> moby = Text("http://www.gutenberg.org/files/2701/2701-0.txt") # doctest: +ELLIPSIS
INFO: 'http://www.gutenberg.org/files/2701/2701-0.txt' found in ...
>>> print(moby.text[27802:27818])
Call me Ishmael.
"""
# In your code you can try using a database or pickle instead of saving files
# This example does not do any processing or cleaning on the text data
# (but you should)
import os
import requests
import sys
import time
def strip_scheme(url):
    """
    Return 'url' without scheme part (e.g. "http://")

    Only the first "://" is treated as the scheme separator, so a URL that
    embeds another URL (e.g. in its query string) keeps everything after
    its own scheme. A 'url' containing no "://" is returned unchanged.

    >>> strip_scheme("https://www.example.com")
    'www.example.com'
    >>> strip_scheme("http://www.gutenberg.org/files/2701/2701-0.txt")
    'www.gutenberg.org/files/2701/2701-0.txt'
    >>> strip_scheme("http://a.example/?next=http://b.example")
    'a.example/?next=http://b.example'
    >>> strip_scheme("www.example.com")
    'www.example.com'
    """
    # TODO: This ad-hoc implementation is still fairly simple-minded
    # (e.g. it does nothing special about URL parameters such as
    # ?sort=reverse&lang=fr). For a more robust implementation, consider
    # https://docs.python.org/3/library/urllib.parse.html
    # partition() splits on the first separator only and never raises,
    # unlike unpacking the result of split("://") into exactly two names.
    _scheme, separator, remainder = url.partition("://")
    return remainder if separator else url
class Text:
    """
    Text class holds text-based information downloaded from the web.

    It uses local file caching to avoid downloading a given file multiple
    times, even across multiple runs of the program.

    Attributes set by the constructor:
      url      -- the URL the text comes from
      local_fn -- path of the cache file for 'url' inside the file cache
      text     -- the text itself (downloaded or read from the cache)
    """

    # Seconds to wait for the server before giving up on a download
    DOWNLOAD_TIMEOUT = 30
    # Seconds to pause after each download to avoid stressing the data source
    DOWNLOAD_DELAY = 2
    # Cache files are always written and read with this encoding, so the
    # cache does not depend on the platform's default encoding
    CACHE_ENCODING = "utf-8"

    # NOTE: the default for 'file_cache' is computed once, when the class is
    # defined, from the value sys.path[0] has at that moment.
    def __init__(self, url, file_cache=os.path.join(sys.path[0], "cache")):
        """
        Given 'url' of a text file, create a new instance with the
        text attribute set by either downloading the URL or retrieving
        it from local text cache.

        Optional 'file_cache' argument specifies where text cache should be
        stored (default: same directory as the script in a "cache" folder)
        """
        self.url = url
        self.local_fn = os.path.join(file_cache, strip_scheme(url))

        # First see if file is already in local file cache
        if self.is_cached():
            print("INFO: {url!r} found in local file cache, reading".format(url=self.url))
            self.read_cache()
        # If not found, download (and write to local file cache)
        else:
            print("INFO: {url!r} not found in local file cache, downloading".format(url=self.url))
            self.download()
            self.write_cache()

    def __repr__(self):
        return "Text({url!r})".format(url=self.url)

    def is_cached(self):
        """Return True if file is already in local file cache"""
        return os.path.exists(self.local_fn)

    def download(self):
        """
        Download URL and save to .text attribute

        Raises requests.HTTPError when the server answers with an error
        status, so that an error page is never kept (and later cached) as
        if it were the requested text. Other requests exceptions
        (connection problems, timeout) propagate unchanged.
        """
        response = requests.get(self.url, timeout=self.DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        self.text = response.text

        # Wait to avoid stressing data source and rate-limiting
        # You don't need to do this here (only has to happen between requests),
        # but you should have it somewhere in your code
        time.sleep(self.DOWNLOAD_DELAY)

    def write_cache(self):
        """Save current .text attribute to text cache"""
        # Create directory if it doesn't exist. exist_ok avoids failing when
        # the directory appears between a check and the creation; a bare
        # file name has no directory part and needs nothing created.
        directory = os.path.dirname(self.local_fn)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Write text to local file cache
        with open(self.local_fn, 'w', encoding=self.CACHE_ENCODING) as fp:
            fp.write(self.text)

    def read_cache(self):
        """Read from text cache (file must exist) and save to .text attribute"""
        # NOTE: the file is opened in text mode with default newline
        # handling, so "\r\n" line endings stored in the cache are read back
        # as "\n"; .text after a cache hit may therefore differ in line
        # endings from .text right after download().
        with open(self.local_fn, 'r', encoding=self.CACHE_ENCODING) as fp:
            self.text = fp.read()
def run_example():
    """
    Return a dictionary with key: book title, value: Text objects for books.

    One Text is created per URL, in the order the URLs are listed; creating
    a Text either downloads the book or reads it from the local file cache.
    """
    urls = {
        "Pride and Prejudice": "http://www.gutenberg.org/files/1342/1342-0.txt",
        "Moby Dick": "http://www.gutenberg.org/files/2701/2701-0.txt",
        "Iliad": "http://www.gutenberg.org/ebooks/6130.txt.utf-8",
    }
    return {title: Text(url) for title, url in urls.items()}
if __name__ == "__main__":
    # To run the doctests embedded in this module, enable these two lines:
    # import doctest
    # doctest.testmod()
    import pprint

    # Fetch every example book (or load it from the cache), then show the
    # resulting title -> Text mapping in a readable layout
    books = run_example()
    pprint.pprint(books)

    # To peek at a short slice of each book's text, enable these lines:
    # for title, book in books.items():
    #     print(book.text[2000:2010])