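"""parser.py: scraper for olimpiada.ru.

Collects links to olympiad pages listed under /activities, extracts each
page's metadata and event schedule concurrently, and saves the results
to a JSON file of human-readable strings.
"""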
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

def fetch_olympiad_page(url):
    """Download a page and return its raw HTML content."""
    print(f"Loading URL: {url}")
    return requests.get(url, timeout=30).content

def extract_olympiad_data(html):
    """Extract meta tags, the page title, and the event table from an olympiad page."""
    soup = BeautifulSoup(html, 'html.parser')
    parsed_data = {}
    # 'description' and 'keywords' are plain meta tags; 'image' and 'url' are OpenGraph tags.
    meta_tags = {'description': 'name', 'keywords': 'name', 'image': 'property', 'url': 'property'}
    for key, attr in meta_tags.items():
        meta = soup.find('meta', {attr: f"og:{key}" if key in ['image', 'url'] else key})
        if meta:
            parsed_data[key] = meta.get('content')
    if soup.title and soup.title.string:
        parsed_data['title'] = soup.title.string.strip()
    events = []
    for row in soup.select('table tr'):
        event_name = row.find('div', class_='event_name')
        event_links = row.find_all('a')
        # The second link in an event row holds the date.
        if event_name and len(event_links) > 1:
            events.append({
                'name': event_name.get_text(strip=True),
                'date': event_links[1].get_text(strip=True),
            })
    if events:
        parsed_data['events'] = events
    return parsed_data

def fetch_all_olympiad_links(main_url):
    """Collect absolute URLs for every /activity/ page linked from the main page."""
    soup = BeautifulSoup(fetch_olympiad_page(main_url), 'html.parser')
    return [f'https://olimpiada.ru{link["href"]}'
            for link in soup.find_all('a', href=True) if '/activity/' in link['href']]

def process_olympiad(link):
    """Fetch and parse a single olympiad page; return None on failure."""
    try:
        return extract_olympiad_data(fetch_olympiad_page(link))
    except Exception as e:
        print(f"Error fetching data from {link}: {e}")
        return None

def dict_to_string(data):
    """Render one parsed olympiad as a human-readable multi-line string."""
    header = "\n".join(f"{key}: {value}" for key, value in data.items() if key != 'events')
    events = "\n".join(f"- {event['name']} ({event['date']})" for event in data.get('events', []))
    return f"{header}\nEvents:\n{events}"

def main():
    main_url = 'https://olimpiada.ru/activities'
    olympiad_links = fetch_all_olympiad_links(main_url)
    print(f"Found {len(olympiad_links)} olympiad links")

    all_olympiad_data = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_link = {executor.submit(process_olympiad, link): link for link in olympiad_links}
        for future in as_completed(future_to_link):
            data = future.result()
            if data:
                all_olympiad_data.append(data)
            # Note: this only paces result collection; the requests themselves run concurrently.
            time.sleep(0.2)

    # Transform the list of dictionaries to a list of strings
    all_olympiad_strings = [dict_to_string(data) for data in all_olympiad_data]

    # Save the list of strings to a JSON file
    with open('all_olympiads_strings.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_olympiad_strings, json_file, ensure_ascii=False, indent=2)
    print("Data has been saved to all_olympiads_strings.json")
    return all_olympiad_data

if __name__ == '__main__':
    main()
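
# Illustrative only: a hypothetical shape of one dict_to_string() entry,
# assuming the page exposes the usual OpenGraph tags and an event table:
#
#   description: <meta description>
#   url: https://olimpiada.ru/activity/...
#   title: <page title>
#   Events:
#   - <event name> (<event date>)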