Skip to content

Commit 76f446f

Browse files
committed
feat: refactoring local scraping
1 parent f2cd856 commit 76f446f

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6+
<title>Sample Product Page</title>
7+
<style>
8+
body {
9+
font-family: Arial, sans-serif;
10+
max-width: 800px;
11+
margin: 0 auto;
12+
padding: 20px;
13+
}
14+
.product {
15+
border: 1px solid #ddd;
16+
border-radius: 8px;
17+
padding: 20px;
18+
margin-bottom: 20px;
19+
}
20+
.price {
21+
color: #e74c3c;
22+
font-size: 24px;
23+
font-weight: bold;
24+
}
25+
.features {
26+
list-style-type: none;
27+
padding: 0;
28+
}
29+
.features li:before {
30+
content: "✓ ";
31+
color: #27ae60;
32+
font-weight: bold;
33+
}
34+
</style>
35+
</head>
36+
<body>
37+
<div class="product">
38+
<h1>Premium Wireless Headphones</h1>
39+
<p class="price">€299.99</p>
40+
41+
<div class="description">
42+
<h2>Description</h2>
43+
<p>
44+
Experience crystal-clear audio with our premium wireless headphones.
45+
Featuring advanced noise cancellation technology and up to 30 hours
46+
of battery life, these headphones are perfect for music lovers and
47+
professionals alike.
48+
</p>
49+
</div>
50+
51+
<div class="features-section">
52+
<h2>Key Features</h2>
53+
<ul class="features">
54+
<li>Active Noise Cancellation (ANC)</li>
55+
<li>30-hour battery life</li>
56+
<li>Bluetooth 5.0 connectivity</li>
57+
<li>Premium leather ear cushions</li>
58+
<li>Foldable design with carry case</li>
59+
<li>Built-in microphone for calls</li>
60+
</ul>
61+
</div>
62+
63+
<div class="contact">
64+
<h2>Contact Information</h2>
65+
<p><strong>Email:</strong> support@example.com</p>
66+
<p><strong>Phone:</strong> +1 (555) 123-4567</p>
67+
<p><strong>Website:</strong> www.example.com</p>
68+
</div>
69+
70+
<div class="availability">
71+
<p><strong>Stock Status:</strong> In Stock</p>
72+
<p><strong>SKU:</strong> WH-1000XM5-BLK</p>
73+
<p><strong>Category:</strong> Electronics > Audio > Headphones</p>
74+
</div>
75+
</div>
76+
</body>
77+
</html>
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""
2+
SmartScraper with Local HTML File Example
3+
4+
This example demonstrates how to use SmartScraper with a local HTML file
5+
instead of fetching content from a URL. Perfect for:
6+
- Testing with static HTML files
7+
- Processing saved web pages
8+
- Working offline
9+
- Debugging and development
10+
11+
Requirements:
12+
- SGAI_API_KEY environment variable must be set
13+
"""
14+
15+
import os
16+
from pathlib import Path
17+
18+
from dotenv import load_dotenv
19+
20+
from scrapegraph_py import Client
21+
from scrapegraph_py.logger import sgai_logger
22+
23+
# Load environment variables from .env file
24+
load_dotenv()
25+
26+
# Configure the scrapegraph_py logger at INFO level
sgai_logger.set_logging(level="INFO")
27+
28+
29+
def read_html_file(file_path: str) -> str:
    """
    Read HTML content from a local file.

    The file is decoded as UTF-8. A leading byte-order mark (written by some
    editors and "save page as" tools) is dropped so it does not show up as a
    stray U+FEFF character at the start of the returned markup.

    Args:
        file_path: Path to the HTML file

    Returns:
        HTML content as string

    Raises:
        FileNotFoundError: If no file exists at ``file_path``.
        OSError: For any other I/O failure while opening or reading.
        UnicodeError: If the file content is not valid UTF-8.
    """
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but skips a leading BOM.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read()
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        raise
    except (OSError, UnicodeError) as e:
        # Report the reason here, then let the caller decide how to proceed.
        print(f"❌ Error reading file: {e}")
        raise
48+
49+
50+
def main():
    """Extract data from a local HTML file using SmartScraper.

    Reads ``sample_product.html`` from the directory containing this script,
    checks that it is within the 2 MB size limit, sends its content to
    SmartScraper together with an extraction prompt, and prints the request
    ID and result.

    Every failure that is detected up front (missing API key, missing or
    unreadable HTML file, oversized file) is reported on stdout and the
    function returns early. The function always returns ``None``.
    """
    # Upper bound, in megabytes, enforced on the HTML payload below
    max_html_size_mb = 2

    # Initialize the client with API key from environment variable
    api_key = os.getenv("SGAI_API_KEY")
    if not api_key:
        print("❌ Error: SGAI_API_KEY environment variable not set")
        print("Please either:")
        print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'")
        print(" 2. Create a .env file with: SGAI_API_KEY=your-api-key-here")
        return

    # Path to the sample HTML file in the same directory
    script_dir = Path(__file__).parent
    html_file_path = script_dir / "sample_product.html"

    # Check if the HTML file exists
    if not html_file_path.exists():
        print(f"❌ HTML file not found at: {html_file_path}")
        print(" Make sure sample_product.html exists in the sync/ directory")
        return

    # Read the HTML file
    print(f"📂 Reading HTML file: {html_file_path.name}")
    try:
        html_content = read_html_file(str(html_file_path))
    except (OSError, UnicodeError):
        # read_html_file prints the reason before re-raising, so stop here
        # quietly instead of ending the example with a traceback.
        return

    # Check file size (max 2MB), measured on the UTF-8 encoded content
    html_size_mb = len(html_content.encode("utf-8")) / (1024 * 1024)
    print(f"📊 HTML file size: {html_size_mb:.4f} MB")

    if html_size_mb > max_html_size_mb:
        print("❌ HTML file exceeds 2MB limit")
        return

    # Define what to extract
    user_prompt = "Extract the product name, price, description, all features, and contact information"

    # Create client and scrape using local HTML
    sgai_client = Client(api_key=api_key)
    try:
        print(f"🎯 Prompt: {user_prompt}")
        print()

        # Pass website_html instead of website_url
        # Note: website_url should be empty string when using website_html
        response = sgai_client.smartscraper(
            website_url="",  # Empty when using website_html
            user_prompt=user_prompt,
            website_html=html_content,  # Pass the HTML content here
        )

        # Print the response
        print("✅ Success! Extracted data from local HTML:")
        print()
        print(f"Request ID: {response['request_id']}")
        print(f"Result: {response['result']}")
        print()
    finally:
        # Release the client even when the request or the printing above
        # raises; previously close() was only reached on the success path.
        sgai_client.close()
109+
110+
111+
if __name__ == "__main__":
    # Print the banner (title, underline, blank line) and run the example.
    banner_title = "SmartScraper with Local HTML File Example"
    banner_rule = "=" * 45
    print(banner_title, banner_rule, "", sep="\n")
    main()

0 commit comments

Comments
 (0)