From 66b2e36a238a843c4a231d88dbf15ff35d95bce5 Mon Sep 17 00:00:00 2001
From: Madhur
Date: Wed, 19 Nov 2025 05:02:34 +0530
Subject: [PATCH 1/3] Add LSTM-based IMDB sentiment analysis example

---
 examples/nlp/imdb_lstm_sentiment.py | 155 ++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 examples/nlp/imdb_lstm_sentiment.py

diff --git a/examples/nlp/imdb_lstm_sentiment.py b/examples/nlp/imdb_lstm_sentiment.py
new file mode 100644
index 0000000000..02ebf8799c
--- /dev/null
+++ b/examples/nlp/imdb_lstm_sentiment.py
@@ -0,0 +1,155 @@
+"""
+Title: Sentiment analysis with LSTM on the IMDB dataset
+Author:Madhur Jain
+Date created: 2025/11/19
+Last Modified: 2025/11/19
+Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews.
+"""
+
+"""
+## Introduction
+
+LSTM stands for Long Short-Term Memory: while predicting, the network keeps not
+only a short-term memory (the hidden state) but also a long-term memory (the
+cell state).
+An LSTM cell uses sigmoid and tanh activation functions:
+    the sigmoid function squashes values into the range 0 to 1,
+    the tanh function squashes values into the range -1 to 1.
+This gating keeps the gradient along the long-term memory path from vanishing
+and the gradient along the short-term path from exploding.
+The cell works in three stages:
+    1st stage: decides what fraction of the long-term memory is kept (the Forget Gate).
+    2nd stage: decides how the long-term memory is updated (the Input Gate).
+    3rd stage: updates the short-term memory, which is the output of the cell (the Output Gate).
+
+If you want to dig deeper, the Stanford Online statistical analysis with Python
+course lectures are available for free on YouTube.
+"""
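+
+"""
+To make the three stages concrete, here is a minimal NumPy sketch of a single
+LSTM cell step. This is illustrative only and not used by the model below: the
+stacked weight layout of `W`, `U`, `b` is one common convention, and the Keras
+`LSTM` layer implements the same arithmetic in optimized form.
+"""
+import numpy as np
+
+
+def _sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+
+def lstm_cell_step(x_t, h_prev, c_prev, W, U, b):
+    """One LSTM time step; W, U, b stack the parameters of all four gates."""
+    z = W @ x_t + U @ h_prev + b  # pre-activations, shape (4 * units,)
+    f, i, o, g = np.split(z, 4)
+    f = _sigmoid(f)  # forget gate: fraction of long-term memory to keep
+    i = _sigmoid(i)  # input gate: how much of the candidate to write
+    o = _sigmoid(o)  # output gate: how much of the cell state to expose
+    g = np.tanh(g)  # candidate memory, in the range -1 to 1
+    c_t = f * c_prev + i * g  # updated long-term memory (cell state)
+    h_t = o * np.tanh(c_t)  # updated short-term memory (hidden state)
+    return h_t, c_t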
+
+import os
+
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
+import json
+from zipfile import ZipFile
+
+import keras
+import tensorflow as tf
+import pandas as pd
+from keras import layers
+from keras.models import Sequential
+from keras.layers import Dense, Embedding, LSTM
+from sklearn.model_selection import train_test_split
+
+# Tokenizer and pad_sequences are the legacy tf.keras text utilities (TF <= 2.15)
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+"""
+## Load the dataset
+Get kaggle.json from your Kaggle account -> Settings -> Create New Token.
+"""
+kaggle_dictionary = json.load(open("kaggle.json"))  # parse the JSON credentials into a dict
+
+# Set up the Kaggle credentials as environment variables
+os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
+os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]
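+
+# The archive itself still has to be fetched. Assumption: the Kaggle CLI is
+# installed (`pip install kaggle`), in which case the following shell command
+# (dataset slug shown for illustration) downloads
+# imdb-dataset-of-50k-movie-reviews.zip into the working directory:
+#
+#   kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews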
+
+# unzip the dataset file
+with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
+    zip_ref.extractall()
+
+# loading the dataset
+data = pd.read_csv("IMDB Dataset.csv")
+
+print(data.shape)
+data.info()
+print(data.head())
+print(data["sentiment"].value_counts())
+
+# map the string labels to integers
+data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)
+
+print(data.head())
+print(data["sentiment"].value_counts())
+
+
+"""
+## Splitting into training and test sets
+"""
+train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
+
+print(train_data.shape)
+print(test_data.shape)
+
+
+"""
+## Data processing
+"""
+# Tokenize the text data: map each word to an integer index,
+# then pad every review to a fixed length of 200 tokens
+tokenizer = Tokenizer(num_words=5000)
+tokenizer.fit_on_texts(train_data["review"])
+X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
+X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)
+
+print(X_train)
+print(X_test)
+
+Y_train = train_data["sentiment"]
+Y_test = test_data["sentiment"]
+
+print(Y_train)
+print(Y_test)
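+
+"""
+As a quick illustration of what the tokenizer does, a toy vocabulary maps each
+word to an integer index by frequency, and `pad_sequences` left-pads every
+sequence to a fixed length. The printed values in the comments are what the
+default settings should produce, but the exact indices depend on the fitted texts.
+"""
+toy = Tokenizer(num_words=10)
+toy.fit_on_texts(["the movie was good", "the movie was bad"])
+print(toy.texts_to_sequences(["the movie was good"]))  # e.g. [[1, 2, 3, 4]]
+print(pad_sequences(toy.texts_to_sequences(["good movie"]), maxlen=6))
+# e.g. [[0 0 0 0 4 2]]  (zero-padded on the left to the fixed length)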
+
+
+"""
+## LSTM (Long Short-Term Memory) model
+"""
+# build the model
+model = Sequential()
+
+# add the layers
+model.add(Embedding(input_dim=5000, output_dim=128, input_shape=(200,)))
+model.add(LSTM(128, dropout=0.2))
+model.add(Dense(1, activation="sigmoid"))
+
+model.summary()
+
+# compile the model
+model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+
+"""
+## Training the model
+"""
+model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)
+
+"""
+## Model evaluation
+"""
+loss, accuracy = model.evaluate(X_test, Y_test)
+print(f"Test Loss: {loss}")
+print(f"Test Accuracy: {accuracy}")
+
+"""
+### Predicting values
+"""
+def predict_sentiment(review):
+    # tokenize and pad the review, then classify it
+    sequence = tokenizer.texts_to_sequences([review])
+    padded_sequence = pad_sequences(sequence, maxlen=200)
+    prediction = model.predict(padded_sequence)
+    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
+    return sentiment
+
+
+# examples
+new_review = "This movie was fantastic. I loved it."
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "This movie was not that good"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "Great movie but could have added a better action scene"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "Mid movie"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "I was laughing the whole way through, what a watch"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
\ No newline at end of file

From f9dfb81105933b0b440d3193716cd208339cf090 Mon Sep 17 00:00:00 2001
From: Madhur
Date: Tue, 2 Dec 2025 18:54:01 +0530
Subject: [PATCH 2/3] Rework the example around keras.utils.get_file and
 TextVectorization

---
 examples/nlp/imdb_lstm_sentiment.py | 283 +++++++++++++++++-------------
 1 file changed, 150 insertions(+), 133 deletions(-)

diff --git a/examples/nlp/imdb_lstm_sentiment.py b/examples/nlp/imdb_lstm_sentiment.py
index 02ebf8799c..68a70f314c 100644
--- a/examples/nlp/imdb_lstm_sentiment.py
+++ b/examples/nlp/imdb_lstm_sentiment.py
@@ -1,155 +1,172 @@
 """
 Title: Sentiment analysis with LSTM on the IMDB dataset
-Author:Madhur Jain
+Author: Madhur Jain
 Date created: 2025/11/19
-Last Modified: 2025/11/19
-Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews.
-"""
-
-"""
-## Introduction
-
-LSTM stands for Long Short-Term Memory: while predicting, the network keeps not
-only a short-term memory (the hidden state) but also a long-term memory (the
-cell state).
-An LSTM cell uses sigmoid and tanh activation functions:
-    the sigmoid function squashes values into the range 0 to 1,
-    the tanh function squashes values into the range -1 to 1.
-This gating keeps the gradient along the long-term memory path from vanishing
-and the gradient along the short-term path from exploding.
-The cell works in three stages:
-    1st stage: decides what fraction of the long-term memory is kept (the Forget Gate).
-    2nd stage: decides how the long-term memory is updated (the Input Gate).
-    3rd stage: updates the short-term memory, which is the output of the cell (the Output Gate).
-
-If you want to dig deeper, the Stanford Online statistical analysis with Python
-course lectures are available for free on YouTube.
-"""
-
-import os
-
-os.environ["KERAS_BACKEND"] = "tensorflow"
-
-import json
-from zipfile import ZipFile
-
-import keras
-import tensorflow as tf
-import pandas as pd
-from keras import layers
-from keras.models import Sequential
-from keras.layers import Dense, Embedding, LSTM
-from sklearn.model_selection import train_test_split
-
-# Tokenizer and pad_sequences are the legacy tf.keras text utilities (TF <= 2.15)
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import Tokenizer
-
-"""
-## Load the dataset
-Get kaggle.json from your Kaggle account -> Settings -> Create New Token.
-"""
-kaggle_dictionary = json.load(open("kaggle.json"))  # parse the JSON credentials into a dict
-
-# Set up the Kaggle credentials as environment variables
-os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
-os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]
-
-# unzip the dataset file
-with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
-    zip_ref.extractall()
-
-# loading the dataset
-data = pd.read_csv("IMDB Dataset.csv")
-
-print(data.shape)
-data.info()
-print(data.head())
-print(data["sentiment"].value_counts())
-
-# map the string labels to integers
-data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)
-
-print(data.head())
-print(data["sentiment"].value_counts())
-
-
-"""
-## Splitting into training and test sets
-"""
-train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
-
-print(train_data.shape)
-print(test_data.shape)
-
-
-"""
-## Data processing
-"""
-# Tokenize the text data: map each word to an integer index,
-# then pad every review to a fixed length of 200 tokens
-tokenizer = Tokenizer(num_words=5000)
-tokenizer.fit_on_texts(train_data["review"])
-X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
-X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)
-
-print(X_train)
-print(X_test)
-
-Y_train = train_data["sentiment"]
-Y_test = test_data["sentiment"]
-
-print(Y_train)
-print(Y_test)
-
-
-"""
-## LSTM (Long Short-Term Memory) model
-"""
-# build the model
-model = Sequential()
-
-# add the layers
-model.add(Embedding(input_dim=5000, output_dim=128, input_shape=(200,)))
-model.add(LSTM(128, dropout=0.2))
-model.add(Dense(1, activation="sigmoid"))
-
-model.summary()
-
-# compile the model
-model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
-
-"""
-## Training the model
-"""
-model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)
-
-"""
-## Model evaluation
-"""
-loss, accuracy = model.evaluate(X_test, Y_test)
-print(f"Test Loss: {loss}")
-print(f"Test Accuracy: {accuracy}")
-
-"""
-### Predicting values
-"""
-def predict_sentiment(review):
-    # tokenize and pad the review, then classify it
-    sequence = tokenizer.texts_to_sequences([review])
-    padded_sequence = pad_sequences(sequence, maxlen=200)
-    prediction = model.predict(padded_sequence)
-    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
-    return sentiment
-
-
-# examples
-new_review = "This movie was fantastic. I loved it."
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "This movie was not that good"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "Great movie but could have added a better action scene"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "Mid movie"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "I was laughing the whole way through, what a watch"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
\ No newline at end of file
+Last Modified: 2025/11/24 (Refactored)
+Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews,
+    demonstrating the modern Keras TextVectorization layer.
+"""
+
+import os
+
+# Set the Keras backend (this must happen before `keras` is imported)
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
+import shutil  # For removing the 'unsup' directory
+import keras
+import tensorflow as tf  # Needed for tf.data.Dataset
+import pandas as pd
+from keras import layers
+from keras.models import Sequential
+from keras.layers import TextVectorization  # Modern Keras text preprocessing
+
+
+## Load the dataset 💾
+
+# URL for the raw IMDB dataset (aclImdb_v1.tar.gz)
+data_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+
+# Use keras.utils.get_file to download and extract the dataset.
+# This ensures portability across environments (Colab, a local machine, etc.)
+dataset_path = keras.utils.get_file(
+    "aclImdb_v1.tar.gz", data_url, untar=True, cache_dir=".", cache_subdir=""
+)
+main_dir = os.path.join(dataset_path, "aclImdb")
+train_dir = os.path.join(main_dir, "train")
+test_dir = os.path.join(main_dir, "test")
+
+# The archive includes an 'unsup' (unsupervised) directory under train/ that
+# this supervised example should ignore
+remove_dir = os.path.join(train_dir, "unsup")
+if os.path.exists(remove_dir):
+    shutil.rmtree(remove_dir)
+
+
+# Helper function to load the extracted files into a DataFrame
+def load_data_from_dir(directory):
+    """Loads text reviews and their labels from a directory."""
+    reviews, sentiments = [], []
+    for sentiment_type, sentiment_label in [("pos", 1), ("neg", 0)]:
+        sentiment_dir = os.path.join(directory, sentiment_type)
+        for fname in os.listdir(sentiment_dir):
+            if fname.endswith(".txt"):
+                # Use a standard, safe encoding
+                with open(os.path.join(sentiment_dir, fname), encoding="utf-8") as f:
+                    reviews.append(f.read())
+                sentiments.append(sentiment_label)
+    return pd.DataFrame({"review": reviews, "sentiment": sentiments})
+
+
+# Load the dataframes directly
+train_df = load_data_from_dir(train_dir)
+test_df = load_data_from_dir(test_dir)
+
+# The data is already split into 25k train and 25k test reviews;
+# separate the features (X) and labels (Y)
+X_train_text = train_df["review"]
+Y_train = train_df["sentiment"]
+X_test_text = test_df["review"]
+Y_test = test_df["sentiment"]
+
+print(f"Training samples: {len(X_train_text)}, Test samples: {len(X_test_text)}")
+
+
+## 🧠 Data Processing with TextVectorization
+
+# Hyperparameters for the TextVectorization layer
+max_features = 5000  # Only consider the top N words
+sequence_length = 200  # Pad/truncate all sequences to a fixed length
+embedding_dim = 128  # Size of the output vector for each word
+
+# 1. Create the TextVectorization layer
+vectorize_layer = TextVectorization(
+    max_tokens=max_features,
+    output_mode="int",  # Outputs integer indices
+    output_sequence_length=sequence_length,
+)
+
+# 2. Adapt the layer to the training text.
+# adapt() builds the vocabulary from the training data; the pandas Series is
+# converted to a batched TensorFlow Dataset for efficient adaptation.
+text_ds = tf.data.Dataset.from_tensor_slices(X_train_text.values).batch(128)
+vectorize_layer.adapt(text_ds)
+
+# Optional: Inspect the vocabulary
+# print("Vocabulary size:", len(vectorize_layer.get_vocabulary()))
+# print("Top 10 words in vocabulary:", vectorize_layer.get_vocabulary()[:10])
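+
+# Quick sanity check (illustrative): the adapted layer maps raw strings to
+# fixed-length integer sequences, so a single review becomes a (1, 200) tensor
+sample = vectorize_layer(tf.constant(["this movie was great"]))
+print("Vectorized sample shape:", sample.shape)  # (1, sequence_length)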
+
+# The text and labels are now ready to be passed directly to model.fit
+
+
+## 🏗️ LSTM Model (End-to-End)
+
+# The TextVectorization layer is included directly in the Sequential model,
+# making the model "end-to-end" (it accepts raw strings).
+model = Sequential([
+    # 1. Input: TextVectorization layer (accepts raw strings, outputs integer sequences)
+    vectorize_layer,
+    # 2. Embedding layer: maps integer indices to dense vectors
+    layers.Embedding(input_dim=max_features, output_dim=embedding_dim, mask_zero=True),
+    # 3. LSTM layer: recurrent processing of the sequence
+    layers.LSTM(128, dropout=0.2),
+    # 4. Dense output layer: binary classification with sigmoid activation
+    layers.Dense(1, activation="sigmoid"),
+])
+
+model.summary()
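+
+# A note on mask_zero=True: the Embedding layer emits a mask so that the LSTM
+# ignores the padded zero positions. A tiny illustration (values are arbitrary):
+#
+#   emb = layers.Embedding(input_dim=10, output_dim=4, mask_zero=True)
+#   print(emb.compute_mask(tf.constant([[5, 3, 0, 0]])))
+#   # -> [[ True  True False False]]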
+
+# Compile the model
+model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+
+
+## 🏃 Training and Evaluation
+
+print("\n## Training the Model")
+# The model can now be trained by passing the raw text and the integer labels
+model.fit(
+    X_train_text,  # Raw text input
+    Y_train,  # Integer labels
+    epochs=5,
+    batch_size=64,
+    validation_split=0.2,
+)
+
+print("\n## Model Evaluation")
+# Note: for evaluation, pass the raw text from the test set
+loss, accuracy = model.evaluate(X_test_text, Y_test)
+print(f"Test Loss: {loss:.4f}")
+print(f"Test Accuracy: {accuracy:.4f}")
+
+
+## 🔮 Predicting Values (Simplified Inference)
+
+def predict_sentiment(review):
+    """Predicts sentiment for a raw text review using the end-to-end model."""
+    # The model accepts a list/array of raw strings directly
+    prediction = model.predict([review])
+
+    # Sigmoid output is a probability
+    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
+    probability = prediction[0][0]
+    return sentiment, probability
+
+
+# Examples
+print("\n### Predicting Values")
+examples = [
+    "This movie was fantastic. I loved it.",
+    "This movie was not that good",
+    "Great movie but could have added a better action scene",
+    "Mid movie"
+]
+
+for review in examples:
+    sentiment, prob = predict_sentiment(review)
+    print(f"Review: '{review[:30]}...' -> Sentiment: {sentiment} ({prob:.2f})")
+
+# Clean up the downloaded directory
+if os.path.exists(main_dir):
+    shutil.rmtree(main_dir)
\ No newline at end of file

From 8aec99562a7ac40f4cbb9039a0aad168a1862d68 Mon Sep 17 00:00:00 2001
From: Madhur
Date: Tue, 2 Dec 2025 19:18:25 +0530
Subject: [PATCH 3/3] Address suggested review changes

---
 examples/nlp/imdb_lstm_sentiment.py | 30 +++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/examples/nlp/imdb_lstm_sentiment.py b/examples/nlp/imdb_lstm_sentiment.py
index 68a70f314c..550f727cd1 100644
--- a/examples/nlp/imdb_lstm_sentiment.py
+++ b/examples/nlp/imdb_lstm_sentiment.py
@@ -2,7 +2,7 @@
 Title: Sentiment analysis with LSTM on the IMDB dataset
 Author: Madhur Jain
 Date created: 2025/11/19
-Last Modified: 2025/11/24 (Refactored)
+Last Modified: 2025/12/02
 Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews,
     demonstrating the modern Keras TextVectorization layer.
 """
@@ -14,7 +14,6 @@
 import pandas as pd
 from keras import layers
 from keras.models import Sequential
-from keras.layers import TextVectorization  # Modern Keras text preprocessing
 
 
 ## Load the dataset 💾
@@ -30,7 +29,7 @@
 dataset_path = keras.utils.get_file(
     "aclImdb_v1.tar.gz", data_url, untar=True, cache_dir=".", cache_subdir=""
 )
-main_dir = os.path.join(dataset_path, "aclImdb")
+main_dir = os.path.join(os.path.dirname(dataset_path), "aclImdb")
 train_dir = os.path.join(main_dir, "train")
 test_dir = os.path.join(main_dir, "test")
@@ -78,7 +77,7 @@
 embedding_dim = 128  # Size of the output vector for each word
 
 # 1. Create the TextVectorization layer
-vectorize_layer = TextVectorization(
+vectorize_layer = layers.TextVectorization(
     max_tokens=max_features,
     output_mode="int",  # Outputs integer indices
     output_sequence_length=sequence_length,
@@ -140,18 +139,18 @@
 print(f"Test Loss: {loss:.4f}")
 print(f"Test Accuracy: {accuracy:.4f}")
 
 ## 🔮 Predicting Values (Simplified Inference)
 
+# This function is handy for one-off predictions; for several reviews at once,
+# the batched call further below is preferred
 def predict_sentiment(review):
     """Predicts sentiment for a raw text review using the end-to-end model."""
     # The model accepts a list/array of raw strings directly
     prediction = model.predict([review])
 
     # Sigmoid output is a probability
-    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
     probability = prediction[0][0]
+    sentiment = "positive" if probability > 0.5 else "negative"
     return sentiment, probability
 
 
 # Examples
@@ -163,10 +162,17 @@
 print("\n### Predicting Values")
 examples = [
     "This movie was fantastic. I loved it.",
     "This movie was not that good",
     "Great movie but could have added a better action scene",
     "Mid movie"
 ]
 
-for review in examples:
-    sentiment, prob = predict_sentiment(review)
+# Predict on the whole batch of examples for efficiency
+predictions = model.predict(examples)
+
+for review, prediction in zip(examples, predictions):
+    prob = prediction[0]
+    sentiment = "positive" if prob > 0.5 else "negative"
     print(f"Review: '{review[:30]}...' -> Sentiment: {sentiment} ({prob:.2f})")
 
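+# Illustrative aside (not required by the example): because preprocessing lives
+# inside the model, the whole pipeline can be saved and reloaded as a single
+# artifact with the native Keras format, along the lines of:
+#
+#   model.save("imdb_lstm_sentiment.keras")
+#   restored = keras.saving.load_model("imdb_lstm_sentiment.keras")
+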
-# Clean up the downloaded directory
+# Clean up the downloaded directory and archive
 if os.path.exists(main_dir):
-    shutil.rmtree(main_dir)
\ No newline at end of file
+    shutil.rmtree(main_dir)
+# This removes the downloaded archive file (e.g., aclImdb_v1.tar.gz)
+if os.path.exists(dataset_path):
+    os.remove(dataset_path)
\ No newline at end of file