forked from apurvsinghgautam/robin
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
79 lines (71 loc) · 3.21 KB
/
scrape.py
File metadata and controls
79 lines (71 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import random
import requests
import threading
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
# Silence all warnings (e.g. urllib3 InsecureRequestWarning noise during scraping).
# NOTE(review): this blanket-suppresses every warning module-wide — consider
# narrowing to specific categories.
warnings.filterwarnings("ignore")
# Define a list of rotating user agents.
# A random entry is picked per request (see scrape_single) to make traffic
# look less like a single automated client.
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
"Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54"
]
# Global counter and lock for thread-safe Tor rotation
# NOTE(review): neither name is referenced by the functions visible in this
# file — presumably reserved for the (currently unused) rotate/* parameters
# of scrape_single; confirm before removing.
request_counter = 0
counter_lock = threading.Lock()
def scrape_single(url_data, rotate=False, rotate_interval=5, control_port=9051, control_password=None):
    """
    Scrape a single URL and return a tuple (url, scraped_text).

    If the URL is an onion site, the request is routed through the local Tor
    SOCKS proxy on 127.0.0.1:9050. On any failure (network error, timeout,
    non-200 status) the scrape degrades gracefully to just the title.

    Parameters:
    - url_data: dict with at least 'link' (the URL) and 'title' keys.
    - rotate, rotate_interval, control_port, control_password: reserved for
      Tor identity rotation; currently unused but kept for interface
      compatibility with callers.
    """
    url = url_data['link']
    use_tor = ".onion" in url
    proxies = None
    if use_tor:
        # socks5h (vs socks5) resolves DNS through the proxy too, which is
        # required for .onion addresses to be reachable at all.
        proxies = {
            "http": "socks5h://127.0.0.1:9050",
            "https": "socks5h://127.0.0.1:9050"
        }
    headers = {
        # Rotate user agents so repeated requests look less uniform.
        "User-Agent": random.choice(USER_AGENTS)
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # Flatten page text onto a single line and prefix the title.
            scraped_text = url_data['title'] + soup.get_text().replace('\n', ' ').replace('\r', '')
        else:
            scraped_text = url_data['title']
    except Exception:
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Best-effort behavior is preserved — any request or
        # parse failure falls back to title-only output.
        scraped_text = url_data['title']
    return url, scraped_text
def scrape_multiple(urls_data, max_workers=5, max_chars=1200):
    """
    Scrape multiple URLs concurrently using a thread pool.

    Parameters:
    - urls_data: list of url_data dicts (each with 'link' and 'title' keys,
      as consumed by scrape_single).
    - max_workers: number of concurrent threads for scraping.
    - max_chars: maximum number of characters kept per scraped page
      (previously a hard-coded constant; default preserves old behavior).

    Returns:
    A dictionary mapping each URL to its (truncated) scraped content.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(scrape_single, url_data): url_data
            for url_data in urls_data
        }
        # Collect results as they finish; order of completion doesn't matter
        # since results are keyed by URL.
        for future in as_completed(future_to_url):
            url, content = future.result()
            # Slicing is a no-op when content is already short enough.
            results[url] = content[:max_chars]
    return results