forked from hassancs91/scraper-api
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbasic_scraper.py
More file actions
136 lines (111 loc) · 5.2 KB
/
basic_scraper.py
File metadata and controls
136 lines (111 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_url(url, timeout=10):
    """
    Fetch and parse the HTML content from a given URL.

    Args:
        url (str): The URL to scrape.
        timeout (float): Seconds to wait for the server before giving up.
            requests has no default timeout, so without this a dead or
            slow host would hang the app indefinitely.

    Returns:
        tuple: (raw_html, parsed_html, status_code, headers) on success,
        or (None, None, None, None) if the request or parsing fails
        (the error is surfaced to the UI via st.error).
    """
    try:
        # Add headers to mimic a browser request so simple bot-blockers
        # don't reject us outright
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Make the request; timeout bounds both connect and read waits
        response = requests.get(url, headers=headers, timeout=timeout)
        # Get the raw HTML
        raw_html = response.text
        # Parse the HTML with the stdlib-backed parser (no extra deps)
        soup = BeautifulSoup(raw_html, 'html.parser')
        return raw_html, soup, response.status_code, response.headers
    except Exception as e:
        # Boundary handler: report any failure (DNS, connection, timeout,
        # invalid URL, parse error) to the UI instead of crashing the app
        st.error(f"Error: {e}")
        return None, None, None, None
# Set up the Streamlit UI
st.title("Basic Web Scraper")

# URL input
url = st.text_input("Enter a URL to scrape:", "https://example.com")

# Scraping options — toggles controlling which tabs show content
with st.expander("Scraping Options"):
    show_raw_html = st.checkbox("Show Raw HTML", value=True)
    show_parsed_elements = st.checkbox("Show Parsed Elements", value=True)
    show_headers = st.checkbox("Show Response Headers", value=False)

# Scrape button
if st.button("Scrape URL"):
    with st.spinner("Scraping..."):
        raw_html, soup, status_code, headers = scrape_url(url)

    # Compare against None, not truthiness: a successful request may
    # legitimately return an empty body ("" is falsy) and must not be
    # reported as a failure.
    if raw_html is not None:
        st.success(f"Successfully scraped URL with status code: {status_code}")

        # Display tabs for different views of the scraped content
        tab1, tab2, tab3, tab4 = st.tabs(["Raw HTML", "Parsed Elements", "Headers", "Data Preview"])

        with tab1:
            if show_raw_html:
                st.subheader("Raw HTML")
                st.code(raw_html, language="html")
            else:
                st.info("Enable 'Show Raw HTML' to view the raw HTML content.")

        with tab2:
            if show_parsed_elements and soup:
                st.subheader("Parsed Elements")

                # Show all links
                st.write("### Links")
                links = soup.find_all('a')
                if links:
                    link_data = []
                    for link in links[:20]:  # Limit to first 20 links
                        link_data.append({
                            "Text": link.text.strip(),
                            "URL": link.get('href', '')
                        })
                    st.dataframe(pd.DataFrame(link_data))
                    if len(links) > 20:
                        st.info(f"Showing 20 of {len(links)} links found.")
                else:
                    st.write("No links found.")

                # Show all headers
                st.write("### Headers (h1, h2, h3)")
                headers_elements = soup.find_all(['h1', 'h2', 'h3'])
                if headers_elements:
                    header_data = []
                    for header in headers_elements:
                        header_data.append({
                            "Type": header.name,
                            "Text": header.text.strip()
                        })
                    st.dataframe(pd.DataFrame(header_data))
                else:
                    st.write("No headers found.")
            else:
                st.info("Enable 'Show Parsed Elements' to view the parsed elements.")

        with tab3:
            if show_headers:
                st.subheader("Response Headers")
                st.json(dict(headers))
            else:
                st.info("Enable 'Show Response Headers' to view the headers.")

        with tab4:
            st.subheader("Data Preview")
            st.write("This tab is where you would display extracted data in a structured format.")
            st.write("For example, if you were scraping a product page, you might extract:")

            # Example of what extracted data might look like; soup is
            # guaranteed non-None here because raw_html is not None
            example_data = {
                "Title": soup.title.text if soup.title else "N/A",
                "URL": url,
                "Number of links": len(soup.find_all('a')) if soup else 0,
                "Number of images": len(soup.find_all('img')) if soup else 0,
            }
            st.json(example_data)
    else:
        st.error("Failed to scrape the URL. Please check the URL and try again.")

# Add some helpful information
st.markdown("---")
st.info("""
This is a basic web scraper for testing purposes. Be mindful of the following:
1. Some websites do not allow scraping and may block your requests
2. Always check a website's robots.txt file and terms of service before scraping
3. This is a simple demo and lacks features like request throttling, proxies, or handling of JavaScript-rendered content
""")