From a7ef85901c05b7f8800d66b0e11a39662236e937 Mon Sep 17 00:00:00 2001
From: Pratik Shelke <115389960+gitpratikshelke@users.noreply.github.com>
Date: Mon, 20 Jan 2025 12:22:35 +0530
Subject: [PATCH] Created preprocess.py

---
 preprocess.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 preprocess.py

diff --git a/preprocess.py b/preprocess.py
new file mode 100644
index 0000000..dbc905a
--- /dev/null
+++ b/preprocess.py
@@ -0,0 +1,104 @@
+import torch
+from transformers import CLIPProcessor, CLIPModel
+from PIL import Image
+import os
+import pandas as pd
+import easyocr
+import re
+
+# Initialize the EasyOCR reader (for English text)
+reader = easyocr.Reader(['en'])
+
+# Initialize the CLIP model and processor
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+# Define the meme image directory
+meme_directory = "E:/BE Project/archive (3)/memes"
+
+# Function to preprocess text
+def preprocess_text(text):
+    # Remove email addresses
+    text = re.sub(r'\S+@\S+\.\S+', '', text)
+    # Remove URLs
+    text = re.sub(r'http\S+|www\S+', '', text)
+    # Remove special characters and numbers (keep letters and spaces)
+    text = re.sub(r'[^A-Za-z\s]', '', text)
+    # Collapse multiple spaces
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+
+# Function to classify toxicity based on image and text
+def classify_toxicity(image_path, text):
+    # Open the image and convert it to RGB for CLIP
+    image = Image.open(image_path).convert("RGB")
+
+    # Prepare the inputs (image and text) with padding and truncation
+    inputs = processor(
+        text=[text],
+        images=image,
+        return_tensors="pt",
+        padding=True,  # Pad the tokenized text in the batch
+        truncation=True  # Truncate text beyond CLIP's maximum length
+    )
+
+    # Get the outputs from the model
+    outputs = model(**inputs)
+
+    # Extract the image and text features
+    image_features = outputs.image_embeds
+    text_features = outputs.text_embeds
+
+    # Calculate the similarity score (cosine similarity)
+    similarity = torch.cosine_similarity(image_features, text_features)
+
+    # Define a threshold for toxicity (example threshold)
+    toxicity_threshold = 0.4
+
+    # If the similarity score is below the threshold, classify as toxic
+    label = "Toxic" if similarity < toxicity_threshold else "Non-toxic"
+
+    return label, similarity.item()
+
+# Function to extract text from an image using EasyOCR
+def extract_text_from_image(image_path):
+    # Perform OCR using EasyOCR
+    result = reader.readtext(image_path)
+    # Combine all the text found in the image
+    text = " ".join([entry[1] for entry in result])
+    # Preprocess the extracted text
+    text = preprocess_text(text)
+    return text.strip()
+
+# Process all meme images in the directory
+results = []
+for filename in os.listdir(meme_directory):
+    if filename.lower().endswith((".jpg", ".jpeg", ".png")):
+        image_path = os.path.join(meme_directory, filename)
+
+        # Extract text from the image
+        text = extract_text_from_image(image_path)
+
+        # If no text was extracted, skip the image
+        if not text:
+            continue
+
+        # Classify toxicity based on the image and extracted text
+        label, score = classify_toxicity(image_path, text)
+
+        # Store the results
+        results.append({
+            "image": filename,
+            "extracted_text": text,
+            "toxicity_label": label,
+            "toxicity_score": score
+        })
+
+# Convert the results into a DataFrame
+df = pd.DataFrame(results)
+
+# Save the results to a CSV file
+df.to_csv("meme_toxicity_results.csv", index=False)
+
+# Preview the first rows of the DataFrame
+print(df.head())
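
Note (not part of the patch): the scoring step above can be exercised in isolation. Below is a minimal sketch of the same CLIP image-text cosine similarity, assuming the openai/clip-vit-base-patch32 checkpoint used in the patch; the file name "example_meme.jpg" is a hypothetical placeholder. The explicit normalize-and-dot form is equivalent to torch.cosine_similarity, and torch.no_grad() simply avoids tracking gradients during inference.

    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def image_text_similarity(image_path, text):
        # Encode one image/text pair and return their cosine similarity
        image = Image.open(image_path).convert("RGB")
        inputs = processor(text=[text], images=image,
                           return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        img = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        txt = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)
        return (img @ txt.T).item()  # scalar similarity in [-1, 1]

    # Hypothetical usage:
    print(image_text_similarity("example_meme.jpg", "example caption"))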
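
Note (not part of the patch): the 0.4 cut-off in classify_toxicity is, per its own comment, an example threshold, and CLIP similarity measures image-text alignment rather than toxicity directly, so the value is worth checking against the scores the script actually produces. A minimal sketch for inspecting the output CSV, assuming a completed run of the script above:

    import pandas as pd

    df = pd.read_csv("meme_toxicity_results.csv")
    print(df["toxicity_score"].describe())      # distribution of similarity scores
    print(df["toxicity_label"].value_counts())  # class balance induced by the 0.4 cut-off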