From 66b2e36a238a843c4a231d88dbf15ff35d95bce5 Mon Sep 17 00:00:00 2001
From: Madhur
Date: Wed, 19 Nov 2025 05:02:34 +0530
Subject: [PATCH 1/3] Add LSTM-based IMDB sentiment analysis example

---
 examples/nlp/imdb_lstm_sentiment.py | 155 ++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 examples/nlp/imdb_lstm_sentiment.py

diff --git a/examples/nlp/imdb_lstm_sentiment.py b/examples/nlp/imdb_lstm_sentiment.py
new file mode 100644
index 0000000000..02ebf8799c
--- /dev/null
+++ b/examples/nlp/imdb_lstm_sentiment.py
@@ -0,0 +1,155 @@
+"""
+Title: Sentiment analysis with LSTM on the IMDB dataset
+Author:Madhur Jain
+Date created: 2025/11/19
+Last Modified: 2025/11/19
+Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews.
+"""
+
+"""
+## Introduction
+
+LSTM stands for Long Short-Term Memory: while predicting, the network keeps not
+only a short-term memory (the hidden state) but also a long-term memory (the
+cell state).
+An LSTM cell uses sigmoid and tanh activation functions:
+    the sigmoid function squashes values into the range 0 to 1,
+    the tanh function squashes values into the range -1 to 1.
+This gating keeps the gradient along the long-term memory path from vanishing
+and the gradient along the short-term path from exploding.
+The cell works in three stages:
+    1st stage: decides what fraction of the long-term memory is kept (the Forget Gate).
+    2nd stage: decides how the long-term memory is updated (the Input Gate).
+    3rd stage: updates the short-term memory, which is the output of the cell (the Output Gate).
+
+If you want to dig deeper, the Stanford Online statistical analysis with Python
+course lectures are available for free on YouTube.
+"""
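+
+"""
+To make the three stages concrete, here is a minimal NumPy sketch of a single
+LSTM cell step. This is illustrative only and not used by the model below: the
+stacked weight layout of `W`, `U`, `b` is one common convention, and the Keras
+`LSTM` layer implements the same arithmetic in optimized form.
+"""
+import numpy as np
+
+
+def _sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+
+def lstm_cell_step(x_t, h_prev, c_prev, W, U, b):
+    """One LSTM time step; W, U, b stack the parameters of all four gates."""
+    z = W @ x_t + U @ h_prev + b  # pre-activations, shape (4 * units,)
+    f, i, o, g = np.split(z, 4)
+    f = _sigmoid(f)  # forget gate: fraction of long-term memory to keep
+    i = _sigmoid(i)  # input gate: how much of the candidate to write
+    o = _sigmoid(o)  # output gate: how much of the cell state to expose
+    g = np.tanh(g)  # candidate memory, in the range -1 to 1
+    c_t = f * c_prev + i * g  # updated long-term memory (cell state)
+    h_t = o * np.tanh(c_t)  # updated short-term memory (hidden state)
+    return h_t, c_t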
+
+import os
+
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
+import json
+from zipfile import ZipFile
+
+import keras
+import tensorflow as tf
+import pandas as pd
+from keras import layers
+from keras.models import Sequential
+from keras.layers import Dense, Embedding, LSTM
+from sklearn.model_selection import train_test_split
+
+# Tokenizer and pad_sequences are the legacy tf.keras text utilities (TF <= 2.15)
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+"""
+## Load the dataset
+Get kaggle.json from your Kaggle account -> Settings -> Create New Token.
+"""
+kaggle_dictionary = json.load(open("kaggle.json"))  # parse the JSON credentials into a dict
+
+# Set up the Kaggle credentials as environment variables
+os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
+os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]
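+
+# The archive itself still has to be fetched. Assumption: the Kaggle CLI is
+# installed (`pip install kaggle`), in which case the following shell command
+# (dataset slug shown for illustration) downloads
+# imdb-dataset-of-50k-movie-reviews.zip into the working directory:
+#
+#   kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews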
+
+# unzip the dataset file
+with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
+    zip_ref.extractall()
+
+# loading the dataset
+data = pd.read_csv("IMDB Dataset.csv")
+
+print(data.shape)
+data.info()
+print(data.head())
+print(data["sentiment"].value_counts())
+
+# map the string labels to integers
+data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)
+
+print(data.head())
+print(data["sentiment"].value_counts())
+
+
+"""
+## Splitting into training and test sets
+"""
+train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
+
+print(train_data.shape)
+print(test_data.shape)
+
+
+"""
+## Data processing
+"""
+# Tokenize the text data: map each word to an integer index,
+# then pad every review to a fixed length of 200 tokens
+tokenizer = Tokenizer(num_words=5000)
+tokenizer.fit_on_texts(train_data["review"])
+X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
+X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)
+
+print(X_train)
+print(X_test)
+
+Y_train = train_data["sentiment"]
+Y_test = test_data["sentiment"]
+
+print(Y_train)
+print(Y_test)
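+
+"""
+As a quick illustration of what the tokenizer does, a toy vocabulary maps each
+word to an integer index by frequency, and `pad_sequences` left-pads every
+sequence to a fixed length. The printed values in the comments are what the
+default settings should produce, but the exact indices depend on the fitted texts.
+"""
+toy = Tokenizer(num_words=10)
+toy.fit_on_texts(["the movie was good", "the movie was bad"])
+print(toy.texts_to_sequences(["the movie was good"]))  # e.g. [[1, 2, 3, 4]]
+print(pad_sequences(toy.texts_to_sequences(["good movie"]), maxlen=6))
+# e.g. [[0 0 0 0 4 2]]  (zero-padded on the left to the fixed length)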
+
+
+"""
+## LSTM (Long Short-Term Memory) model
+"""
+# build the model
+model = Sequential()
+
+# add the layers
+model.add(Embedding(input_dim=5000, output_dim=128, input_shape=(200,)))
+model.add(LSTM(128, dropout=0.2))
+model.add(Dense(1, activation="sigmoid"))
+
+model.summary()
+
+# compile the model
+model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+
+"""
+## Training the model
+"""
+model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)
+
+"""
+## Model evaluation
+"""
+loss, accuracy = model.evaluate(X_test, Y_test)
+print(f"Test Loss: {loss}")
+print(f"Test Accuracy: {accuracy}")
+
+"""
+### Predicting values
+"""
+def predict_sentiment(review):
+    # tokenize and pad the review, then classify it
+    sequence = tokenizer.texts_to_sequences([review])
+    padded_sequence = pad_sequences(sequence, maxlen=200)
+    prediction = model.predict(padded_sequence)
+    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
+    return sentiment
+
+
+# examples
+new_review = "This movie was fantastic. I loved it."
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "This movie was not that good"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "Great movie but could have added a better action scene"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "Mid movie"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
+# ================================================================= #
+new_review = "I was laughing the whole way through, what a watch"
+sentiment = predict_sentiment(new_review)
+print(f"The sentiment of the review is: {sentiment}")
\ No newline at end of file

From f9dfb81105933b0b440d3193716cd208339cf090 Mon Sep 17 00:00:00 2001
From: Madhur
Date: Tue, 2 Dec 2025 18:54:01 +0530
Subject: [PATCH 2/3] Rework the example around keras.utils.get_file and
 TextVectorization

---
 examples/nlp/imdb_lstm_sentiment.py | 283 +++++++++++++++++-------------
 1 file changed, 150 insertions(+), 133 deletions(-)

diff --git a/examples/nlp/imdb_lstm_sentiment.py b/examples/nlp/imdb_lstm_sentiment.py
index 02ebf8799c..68a70f314c 100644
--- a/examples/nlp/imdb_lstm_sentiment.py
+++ b/examples/nlp/imdb_lstm_sentiment.py
@@ -1,155 +1,172 @@
 """
 Title: Sentiment analysis with LSTM on the IMDB dataset
-Author:Madhur Jain
+Author: Madhur Jain
 Date created: 2025/11/19
-Last Modified: 2025/11/19
-Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews.
-"""
-
-"""
-## Introduction
-
-LSTM stands for Long Short-Term Memory: while predicting, the network keeps not
-only a short-term memory (the hidden state) but also a long-term memory (the
-cell state).
-An LSTM cell uses sigmoid and tanh activation functions:
-    the sigmoid function squashes values into the range 0 to 1,
-    the tanh function squashes values into the range -1 to 1.
-This gating keeps the gradient along the long-term memory path from vanishing
-and the gradient along the short-term path from exploding.
-The cell works in three stages:
-    1st stage: decides what fraction of the long-term memory is kept (the Forget Gate).
-    2nd stage: decides how the long-term memory is updated (the Input Gate).
-    3rd stage: updates the short-term memory, which is the output of the cell (the Output Gate).
-
-If you want to dig deeper, the Stanford Online statistical analysis with Python
-course lectures are available for free on YouTube.
-"""
-
-import os
-
-os.environ["KERAS_BACKEND"] = "tensorflow"
-
-import json
-from zipfile import ZipFile
-
-import keras
-import tensorflow as tf
-import pandas as pd
-from keras import layers
-from keras.models import Sequential
-from keras.layers import Dense, Embedding, LSTM
-from sklearn.model_selection import train_test_split
-
-# Tokenizer and pad_sequences are the legacy tf.keras text utilities (TF <= 2.15)
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import Tokenizer
-
-"""
-## Load the dataset
-Get kaggle.json from your Kaggle account -> Settings -> Create New Token.
-"""
-kaggle_dictionary = json.load(open("kaggle.json"))  # parse the JSON credentials into a dict
-
-# Set up the Kaggle credentials as environment variables
-os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
-os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]
-
-# unzip the dataset file
-with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
-    zip_ref.extractall()
-
-# loading the dataset
-data = pd.read_csv("IMDB Dataset.csv")
-
-print(data.shape)
-data.info()
-print(data.head())
-print(data["sentiment"].value_counts())
-
-# map the string labels to integers
-data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)
-
-print(data.head())
-print(data["sentiment"].value_counts())
-
-
-"""
-## Splitting into training and test sets
-"""
-train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
-
-print(train_data.shape)
-print(test_data.shape)
-
-
-"""
-## Data processing
-"""
-# Tokenize the text data: map each word to an integer index,
-# then pad every review to a fixed length of 200 tokens
-tokenizer = Tokenizer(num_words=5000)
-tokenizer.fit_on_texts(train_data["review"])
-X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
-X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)
-
-print(X_train)
-print(X_test)
-
-Y_train = train_data["sentiment"]
-Y_test = test_data["sentiment"]
-
-print(Y_train)
-print(Y_test)
-
-
-"""
-## LSTM (Long Short-Term Memory) model
-"""
-# build the model
-model = Sequential()
-
-# add the layers
-model.add(Embedding(input_dim=5000, output_dim=128, input_shape=(200,)))
-model.add(LSTM(128, dropout=0.2))
-model.add(Dense(1, activation="sigmoid"))
-
-model.summary()
-
-# compile the model
-model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
-
-"""
-## Training the model
-"""
-model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)
-
-"""
-## Model evaluation
-"""
-loss, accuracy = model.evaluate(X_test, Y_test)
-print(f"Test Loss: {loss}")
-print(f"Test Accuracy: {accuracy}")
-
-"""
-### Predicting values
-"""
-def predict_sentiment(review):
-    # tokenize and pad the review, then classify it
-    sequence = tokenizer.texts_to_sequences([review])
-    padded_sequence = pad_sequences(sequence, maxlen=200)
-    prediction = model.predict(padded_sequence)
-    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
-    return sentiment
-
-
-# examples
-new_review = "This movie was fantastic. I loved it."
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "This movie was not that good"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "Great movie but could have added a better action scene"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "Mid movie"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
-# ================================================================= #
-new_review = "I was laughing the whole way through, what a watch"
-sentiment = predict_sentiment(new_review)
-print(f"The sentiment of the review is: {sentiment}")
\ No newline at end of file
+Last Modified: 2025/11/24 (Refactored)
+Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews,
+    demonstrating the modern Keras TextVectorization layer.
+"""
+
+import os
+
+# Set the Keras backend (this must happen before `keras` is imported)
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
+import shutil  # For removing the 'unsup' directory
+import keras
+import tensorflow as tf  # Needed for tf.data.Dataset
+import pandas as pd
+from keras import layers
+from keras.models import Sequential
+from keras.layers import TextVectorization  # Modern Keras text preprocessing
+
+
+## Load the dataset 💾
+
+# URL for the raw IMDB dataset (aclImdb_v1.tar.gz)
+data_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+
+# Use keras.utils.get_file to download and extract the dataset.
+# This ensures portability across environments (Colab, a local machine, etc.)
+dataset_path = keras.utils.get_file(
+    "aclImdb_v1.tar.gz", data_url, untar=True, cache_dir=".", cache_subdir=""
+)
+main_dir = os.path.join(dataset_path, "aclImdb")
+train_dir = os.path.join(main_dir, "train")
+test_dir = os.path.join(main_dir, "test")
+
+# The archive includes an 'unsup' (unsupervised) directory under train/ that
+# this supervised example should ignore
+remove_dir = os.path.join(train_dir, "unsup")
+if os.path.exists(remove_dir):
+    shutil.rmtree(remove_dir)
+
+
+# Helper function to load the extracted files into a DataFrame
+def load_data_from_dir(directory):
+    """Loads text reviews and their labels from a directory."""
+    reviews, sentiments = [], []
+    for sentiment_type, sentiment_label in [("pos", 1), ("neg", 0)]:
+        sentiment_dir = os.path.join(directory, sentiment_type)
+        for fname in os.listdir(sentiment_dir):
+            if fname.endswith(".txt"):
+                # Use a standard, safe encoding
+                with open(os.path.join(sentiment_dir, fname), encoding="utf-8") as f:
+                    reviews.append(f.read())
+                sentiments.append(sentiment_label)
+    return pd.DataFrame({"review": reviews, "sentiment": sentiments})
+
+
+# Load the dataframes directly
+train_df = load_data_from_dir(train_dir)
+test_df = load_data_from_dir(test_dir)
+
+# The data is already split into 25k train and 25k test reviews;
+# separate the features (X) and labels (Y)
+X_train_text = train_df["review"]
+Y_train = train_df["sentiment"]
+X_test_text = test_df["review"]
+Y_test = test_df["sentiment"]
+
+print(f"Training samples: {len(X_train_text)}, Test samples: {len(X_test_text)}")
+
+
+## 🧠 Data Processing with TextVectorization
+
+# Hyperparameters for the TextVectorization layer
+max_features = 5000  # Only consider the top N words
+sequence_length = 200  # Pad/truncate all sequences to a fixed length
+embedding_dim = 128  # Size of the output vector for each word
+
+# 1. Create the TextVectorization layer
+vectorize_layer = TextVectorization(
+    max_tokens=max_features,
+    output_mode="int",  # Outputs integer indices
+    output_sequence_length=sequence_length,
+)
+
+# 2. Adapt the layer to the training text.
+# adapt() builds the vocabulary from the training data; the pandas Series is
+# converted to a batched TensorFlow Dataset for efficient adaptation.
+text_ds = tf.data.Dataset.from_tensor_slices(X_train_text.values).batch(128)
+vectorize_layer.adapt(text_ds)
+
+# Optional: Inspect the vocabulary
+# print("Vocabulary size:", len(vectorize_layer.get_vocabulary()))
+# print("Top 10 words in vocabulary:", vectorize_layer.get_vocabulary()[:10])
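+
+# Quick sanity check (illustrative): the adapted layer maps raw strings to
+# fixed-length integer sequences, so a single review becomes a (1, 200) tensor
+sample = vectorize_layer(tf.constant(["this movie was great"]))
+print("Vectorized sample shape:", sample.shape)  # (1, sequence_length)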
+
+# The text and labels are now ready to be passed directly to model.fit
+
+
+## 🏗️ LSTM Model (End-to-End)
+
+# The TextVectorization layer is included directly in the Sequential model,
+# making the model "end-to-end" (it accepts raw strings).
+model = Sequential([
+    # 1. Input: TextVectorization layer (accepts raw strings, outputs integer sequences)
+    vectorize_layer,
+    # 2. Embedding layer: maps integer indices to dense vectors
+    layers.Embedding(input_dim=max_features, output_dim=embedding_dim, mask_zero=True),
+    # 3. LSTM layer: recurrent processing of the sequence
+    layers.LSTM(128, dropout=0.2),
+    # 4. Dense output layer: binary classification with sigmoid activation
+    layers.Dense(1, activation="sigmoid"),
+])
+
+model.summary()
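+
+# A note on mask_zero=True: the Embedding layer emits a mask so that the LSTM
+# ignores the padded zero positions. A tiny illustration (values are arbitrary):
+#
+#   emb = layers.Embedding(input_dim=10, output_dim=4, mask_zero=True)
+#   print(emb.compute_mask(tf.constant([[5, 3, 0, 0]])))
+#   # -> [[ True  True False False]]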
+
+# Compile the model
+model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+
+
+## 🏃 Training and Evaluation
+
+print("\n## Training the Model")
+# The model can now be trained by passing the raw text and the integer labels
+model.fit(
+    X_train_text,  # Raw text input
+    Y_train,  # Integer labels
+    epochs=5,
+    batch_size=64,
+    validation_split=0.2,
+)
+
+print("\n## Model Evaluation")
+# Note: for evaluation, pass the raw text from the test set
+loss, accuracy = model.evaluate(X_test_text, Y_test)
+print(f"Test Loss: {loss:.4f}")
+print(f"Test Accuracy: {accuracy:.4f}")
+
+
+## 🔮 Predicting Values (Simplified Inference)
+
+def predict_sentiment(review):
+    """Predicts sentiment for a raw text review using the end-to-end model."""
+    # The model accepts a list/array of raw strings directly
+    prediction = model.predict([review])
+
+    # Sigmoid output is a probability
+    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
+    probability = prediction[0][0]
+    return sentiment, probability
+
+
+# Examples
+print("\n### Predicting Values")
+examples = [
+    "This movie was fantastic. I loved it.",
+    "This movie was not that good",
+    "Great movie but could have added a better action scene",
+    "Mid movie"
+]
+
+for review in examples:
+    sentiment, prob = predict_sentiment(review)
+    print(f"Review: '{review[:30]}...' -> Sentiment: {sentiment} ({prob:.2f})")
+
+# Clean up the downloaded directory
+if os.path.exists(main_dir):
+    shutil.rmtree(main_dir)
\ No newline at end of file

From 8aec99562a7ac40f4cbb9039a0aad168a1862d68 Mon Sep 17 00:00:00 2001
From: Madhur
Date: Tue, 2 Dec 2025 19:18:25 +0530
Subject: [PATCH 3/3] Address suggested review changes

---
 examples/nlp/imdb_lstm_sentiment.py | 30 +++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/examples/nlp/imdb_lstm_sentiment.py b/examples/nlp/imdb_lstm_sentiment.py
index 68a70f314c..550f727cd1 100644
--- a/examples/nlp/imdb_lstm_sentiment.py
+++ b/examples/nlp/imdb_lstm_sentiment.py
@@ -2,7 +2,7 @@
 Title: Sentiment analysis with LSTM on the IMDB dataset
 Author: Madhur Jain
 Date created: 2025/11/19
-Last Modified: 2025/11/24 (Refactored)
+Last Modified: 2025/12/02
 Description: A simple LSTM-based sentiment classifier trained on IMDB text reviews,
     demonstrating the modern Keras TextVectorization layer.
 """
@@ -14,7 +14,6 @@
 import pandas as pd
 from keras import layers
 from keras.models import Sequential
-from keras.layers import TextVectorization  # Modern Keras text preprocessing
 
 
 ## Load the dataset 💾
@@ -30,7 +29,7 @@
 dataset_path = keras.utils.get_file(
     "aclImdb_v1.tar.gz", data_url, untar=True, cache_dir=".", cache_subdir=""
 )
-main_dir = os.path.join(dataset_path, "aclImdb")
+main_dir = os.path.join(os.path.dirname(dataset_path), "aclImdb")
 train_dir = os.path.join(main_dir, "train")
 test_dir = os.path.join(main_dir, "test")
@@ -78,7 +77,7 @@
 embedding_dim = 128  # Size of the output vector for each word
 
 # 1. Create the TextVectorization layer
-vectorize_layer = TextVectorization(
+vectorize_layer = layers.TextVectorization(
     max_tokens=max_features,
     output_mode="int",  # Outputs integer indices
     output_sequence_length=sequence_length,
@@ -140,18 +139,18 @@
 print(f"Test Loss: {loss:.4f}")
 print(f"Test Accuracy: {accuracy:.4f}")
 
 ## 🔮 Predicting Values (Simplified Inference)
 
+# This function is handy for one-off predictions; for several reviews at once,
+# the batched call further below is preferred
 def predict_sentiment(review):
     """Predicts sentiment for a raw text review using the end-to-end model."""
     # The model accepts a list/array of raw strings directly
     prediction = model.predict([review])
 
     # Sigmoid output is a probability
-    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
     probability = prediction[0][0]
+    sentiment = "positive" if probability > 0.5 else "negative"
     return sentiment, probability
 
 
 # Examples
@@ -163,10 +162,17 @@
 print("\n### Predicting Values")
 examples = [
     "This movie was fantastic. I loved it.",
     "This movie was not that good",
     "Great movie but could have added a better action scene",
     "Mid movie"
 ]
 
-for review in examples:
-    sentiment, prob = predict_sentiment(review)
+# Predict on the whole batch of examples for efficiency
+predictions = model.predict(examples)
+
+for review, prediction in zip(examples, predictions):
+    prob = prediction[0]
+    sentiment = "positive" if prob > 0.5 else "negative"
     print(f"Review: '{review[:30]}...' -> Sentiment: {sentiment} ({prob:.2f})")
 
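+# Illustrative aside (not required by the example): because preprocessing lives
+# inside the model, the whole pipeline can be saved and reloaded as a single
+# artifact with the native Keras format, along the lines of:
+#
+#   model.save("imdb_lstm_sentiment.keras")
+#   restored = keras.saving.load_model("imdb_lstm_sentiment.keras")
+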
-# Clean up the downloaded directory
+# Clean up the downloaded directory and archive
 if os.path.exists(main_dir):
-    shutil.rmtree(main_dir)
\ No newline at end of file
+    shutil.rmtree(main_dir)
+# This removes the downloaded archive file (e.g., aclImdb_v1.tar.gz)
+if os.path.exists(dataset_path):
+    os.remove(dataset_path)
\ No newline at end of file