-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaSentenceExtractor.py
More file actions
83 lines (65 loc) · 1.85 KB
/
aSentenceExtractor.py
File metadata and controls
83 lines (65 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
1. Make a program that goes to https://sentence.yourdictionary.com/the and gets all the sentences
2. Add the words into a list
3. Save the list onto a file
"""
from bs4 import BeautifulSoup
import requests
import re
# Page address that getFile() reads. The input loop at the bottom of the file
# rebinds this name to "<base URL>/<word>" before every getFile() call.
MyURL = "https://sentence.yourdictionary.com/"
# Headers passed to requests.get() in getText(). Only a desktop-browser
# User-Agent string is set (presumably so the site serves its normal page to
# the script -- confirm).
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
def remove_double_spaces(text):
    """Drop one leading space from *text* and collapse runs of spaces.

    Args:
        text: The string to clean. May be empty.

    Returns:
        ``text`` without its first character when that character is a space,
        and with every run of one or more spaces replaced by a single space.
        Only one leading space is removed before collapsing, so an input that
        starts with several spaces still starts with one space afterwards.
    """
    # startswith() is safe on an empty string; indexing text[0] raised
    # IndexError there.
    if text.startswith(" "):
        text = text[1:]
    return re.sub(' +', ' ', text)
def remove_space(string):
    """Return *string* without its first character if that character is a space.

    Args:
        string: The string to trim. May be empty.

    Returns:
        ``string[1:]`` when ``string`` starts with a space, otherwise
        ``string`` unchanged. At most one space is removed.
    """
    # startswith() copes with the empty string; string[0] raised IndexError.
    if string.startswith(" "):
        return string[1:]
    return string
# Module-level list. getFile() assigns its own local `myList`, so no code
# visible in this file reads or modifies this one.
myList = []
def tidySentence(sentence):
    """Clean the spacing of *sentence* and give it sentence-style casing.

    Args:
        sentence: The raw string to tidy. May be empty.

    Returns:
        The string after remove_space() and remove_double_spaces() have been
        applied, with its first character upper-cased and every other
        character lower-cased (``str.capitalize``). An empty input is returned
        unchanged.
    """
    if not sentence:
        return sentence
    cleaned = remove_double_spaces(remove_space(sentence))
    # str is immutable: lower()/capitalize() return new strings, so calling
    # them without keeping the result (as before) changed nothing.
    # capitalize() already lower-cases everything after the first character,
    # which covers the separate lower() step. It runs after the whitespace
    # clean-up so that the first character is a letter where possible.
    return cleaned.capitalize()
def remove_every_other(my_list):
    """Return the items of *my_list* found at positions 0, 2, 4, ...

    The result is produced by slicing, so it has whatever type slicing
    ``my_list`` yields (a list for a list, a str for a str, ...).
    """
    even_positions = slice(None, None, 2)
    return my_list[even_positions]
def GT(Soup):
    """Return the result of calling ``get_text()`` on *Soup*.

    In this file the argument is the BeautifulSoup object built in getText().
    """
    return Soup.get_text()
def getText(url, timeout=10):
    """Download *url* and return the text extracted from the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks the script
            indefinitely.

    Returns:
        The value GT() returns for the parsed page, i.e. the result of
        BeautifulSoup's ``get_text()``.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out.
    """
    # The module-level `headers` dict supplies the User-Agent for the request.
    MyRequest = requests.get(url, headers=headers, timeout=timeout)
    Soup = BeautifulSoup(MyRequest.text, "lxml")
    return GT(Soup)
def getFile():
    """Scrape the page(s) named by the global ``MyURL`` into a text file.

    ``MyURL`` may be a single URL string or a list of URL strings. The page
    text is fetched with getText(), split on single spaces, filtered, and the
    surviving tokens are written one per line (after tidySentence()) to
    ``<name>.txt`` in the current directory, where ``<name>`` is the part of
    the URL after the 36-character site prefix. For a list of URLs the
    per-URL names are joined with ``_``.

    Raises:
        TypeError: if ``MyURL`` is neither a str nor a list.
    """
    # Length of "https://sentence.yourdictionary.com/"; what follows is the word.
    word_offset = 36
    # Number of leading tokens dropped from every page (presumably page
    # header/navigation text -- confirm against the live site).
    leading_tokens_to_skip = 88

    if isinstance(MyURL, str):
        urls = [MyURL]
    elif isinstance(MyURL, list):
        urls = MyURL
    else:
        # Previously this fell through and failed later on an unbound `txt`.
        raise TypeError("MyURL must be a str or a list of str")

    # The old list branch collected single characters into a list and then
    # called .split() on that list, which raised AttributeError. Joining the
    # page texts gives one string that can be split as before.
    txt = "".join(getText(url) for url in urls)

    # NOTE(review): tokens produced by split(' ') cannot contain a space, so
    # the " Advertisement\n" comparison never matches as written -- confirm
    # the intended delimiter.
    myList = [
        x for x in txt.split(' ')
        if len(x) > 10 and x != " Advertisement\n"
    ]

    # Same diagnostic as the old pop(0) loop, which printed "Error" when the
    # list ran out before 88 items had been removed.
    if len(myList) < leading_tokens_to_skip:
        print("Error")
    del myList[:leading_tokens_to_skip]

    name = "_".join(url[word_offset:] for url in urls)
    # `with` closes the file even if writing fails; an explicit encoding keeps
    # non-ASCII page text from failing on platforms with a narrow default.
    with open("%s.txt" % name, "w", encoding="utf-8") as myFile:
        for x in myList:
            myFile.write(tidySentence(x) + '\n')
# Interactive driver: prompt for a word, scrape its page into a file, and
# repeat until the user types "quit".
while True:
    print("put in your input")
    word = input("")
    if word == "quit":
        break
    # getFile() reads the module-level MyURL, so rebind it before each call.
    MyURL = "https://sentence.yourdictionary.com/%s" % word
    getFile()