import requests
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Function for web scraping
def scrape_website(url):
    """Fetch *url* and extract articles from its HTML.

    Returns a list of ``{'title', 'content'}`` dicts, one per ``<article>``
    element (title from its first ``<h2>``, content from its first ``<p>``),
    or an error string when the response is not HTML.
    """
    # timeout prevents the request from hanging forever on a stalled server
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # fail loudly instead of scraping an error page
    # .get() avoids a KeyError when the server omits the Content-Type header
    if 'text/html' in response.headers.get('Content-Type', ''):
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        # Skip articles missing an <h2> or <p>; the original raised
        # AttributeError ('NoneType' has no attribute 'text') on those.
        return [
            {'title': article.h2.text, 'content': article.p.text}
            for article in articles
            if article.h2 is not None and article.p is not None
        ]
    else:
        return "Non-HTML content received, cannot scrape."
# Function to respect the site's robots.txt rules
def can_fetch(url):
    """Return True when the site's robots.txt permits fetching *url*.

    Downloads ``<scheme>://<host>/robots.txt`` and evaluates it with the
    stdlib robots.txt parser for the wildcard user agent. A missing or
    unreadable robots.txt, or any network failure, yields False
    (conservative, matching the original behavior).
    """
    from urllib.parse import urlparse
    from urllib.robotparser import RobotFileParser

    try:
        # robots.txt lives at the site root, not under the page's path;
        # the original f"{url}/robots.txt" broke for URLs with a path.
        parts = urlparse(url)
        robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
        response = requests.get(robots_url, timeout=10)
        if response.status_code != 200:
            return False
        parser = RobotFileParser()
        parser.parse(response.text.splitlines())
        # Proper per-path evaluation. The original substring check rejected
        # any robots.txt containing "Disallow:" at all, even for rules that
        # only cover unrelated paths.
        return parser.can_fetch('*', url)
    except requests.exceptions.RequestException:
        return False
# Run the web-scraping step, guarded by the site's robots.txt.
# (The original header line lacked the '#' prefix — a syntax error.)
website_url = 'http://example.com/articles'
if can_fetch(website_url):
    scraped_data = scrape_website(website_url)
    print(scraped_data)
else:
    print("robots.txt rules do not allow scraping this website.")
# Text-classification demo on the 20 newsgroups corpus.
# (The original Turkish step headers lacked the '#' prefix — syntax errors.)

# Load the dataset (downloads the corpus on first use).
data = fetch_20newsgroups()
# Split into train and test sets; fixed seed for reproducibility.
train_data, test_data, train_labels, test_labels = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42
)
# Build the model: TF-IDF features feeding a multinomial naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Train the model.
model.fit(train_data, train_labels)
# Predict on the held-out test data.
predicted_labels = model.predict(test_data)
# Evaluate performance.
print(confusion_matrix(test_labels, predicted_labels))
print(classification_report(test_labels, predicted_labels))
import requests
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Function for web scraping
def scrape_website(url):
    """Fetch *url* and extract articles from its HTML.

    Returns a list of ``{'title', 'content'}`` dicts, one per ``<article>``
    element (title from its first ``<h2>``, content from its first ``<p>``),
    or an error string when the response is not HTML.
    """
    # timeout prevents the request from hanging forever on a stalled server
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # fail loudly instead of scraping an error page
    # .get() avoids a KeyError when the server omits the Content-Type header
    if 'text/html' in response.headers.get('Content-Type', ''):
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        # Skip articles missing an <h2> or <p>; the original raised
        # AttributeError ('NoneType' has no attribute 'text') on those.
        return [
            {'title': article.h2.text, 'content': article.p.text}
            for article in articles
            if article.h2 is not None and article.p is not None
        ]
    else:
        return "Non-HTML content received, cannot scrape."
# Function to respect the site's robots.txt rules
def can_fetch(url):
    """Return True when the site's robots.txt permits fetching *url*.

    Downloads ``<scheme>://<host>/robots.txt`` and evaluates it with the
    stdlib robots.txt parser for the wildcard user agent. A missing or
    unreadable robots.txt, or any network failure, yields False
    (conservative, matching the original behavior).
    """
    from urllib.parse import urlparse
    from urllib.robotparser import RobotFileParser

    try:
        # robots.txt lives at the site root, not under the page's path;
        # the original f"{url}/robots.txt" broke for URLs with a path.
        parts = urlparse(url)
        robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
        response = requests.get(robots_url, timeout=10)
        if response.status_code != 200:
            return False
        parser = RobotFileParser()
        parser.parse(response.text.splitlines())
        # Proper per-path evaluation. The original substring check rejected
        # any robots.txt containing "Disallow:" at all, even for rules that
        # only cover unrelated paths.
        return parser.can_fetch('*', url)
    except requests.exceptions.RequestException:
        return False
# Run the web-scraping step, guarded by the site's robots.txt.
# (The original header line lacked the '#' prefix — a syntax error.)
website_url = 'http://example.com/articles'
if can_fetch(website_url):
    scraped_data = scrape_website(website_url)
    print(scraped_data)
else:
    print("robots.txt rules do not allow scraping this website.")
# Text-classification demo on the 20 newsgroups corpus.
# (The original Turkish step headers lacked the '#' prefix — syntax errors.)

# Load the dataset (downloads the corpus on first use).
data = fetch_20newsgroups()
# Split into train and test sets; fixed seed for reproducibility.
train_data, test_data, train_labels, test_labels = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42
)
# Build the model: TF-IDF features feeding a multinomial naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Train the model.
model.fit(train_data, train_labels)
# Predict on the held-out test data.
predicted_labels = model.predict(test_data)
# Evaluate performance.
print(confusion_matrix(test_labels, predicted_labels))
print(classification_report(test_labels, predicted_labels))