grc-cohort-21 · elena5100 · Jan 21, 2025 · Jan 30, 2025 · Jan 30, 2025 · Jan 30, 2025
diff --git a/README.md b/README.md
@@ -110,4 +110,6 @@ Consider doing any of the following (some are very hard!):
 ## Submitting
 Submit your project by making a PR and copying the link to the canvas assignment.
 
-TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED.
+TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED.
+
+shams
diff --git a/oscarWildeTraining.txt b/oscarWildeTraining.txt
diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java
@@ -1,3 +1,6 @@
+
+
+    import java.util.ArrayList;
 import java.util.List;
 import java.util.Scanner;
 
@@ -28,9 +31,40 @@ public class LowercaseSentenceTokenizer implements Tokenizer {
    * @param scanner the Scanner to read the input text from
    * @return a list of tokens, where each token is a word or a period
    */
-  public List<String> tokenize(Scanner scanner) {
-    // TODO: Implement this function to convert the scanner's input to a list of words and periods
-    return null;
-  }
+    /* TODO: Implement this function to convert the scanner's input to a list of words and periods
+    return null; */
+
+
+@Override
+    public List<String> tokenize(Scanner scanner) {
+        List<String> tokens = new ArrayList<>();
+
+        // Scanner.next() yields whitespace-delimited words across the WHOLE input (spaces, tabs
+        // and newlines). Compared with reading a single nextLine() and splitting it, this means:
+        //   - multi-line text (e.g. a training file) is consumed completely, not just line one;
+        //   - leading or repeated whitespace never produces an empty "" token;
+        //   - empty input returns an empty list instead of throwing NoSuchElementException.
+        while (scanner.hasNext()) {
+            String word = scanner.next().toLowerCase();
+
+            if (word.endsWith(".")) {
+                // Split a trailing period into its own token, e.g. "dog." -> "dog", ".".
+                // String.substring(0, length - 1) drops only the final character.
+                String withoutPeriod = word.substring(0, word.length() - 1);
+                if (!withoutPeriod.isEmpty()) {
+                    tokens.add(withoutPeriod); // the word itself, without the period
+                }
+                tokens.add("."); // the period as a separate token
+            } else {
+                tokens.add(word);
+            }
+        }
+
+        return tokens;
+    }
 }
 
+
diff --git a/src/LowercaseSentenceTokenizerTest.java b/src/LowercaseSentenceTokenizerTest.java
@@ -20,6 +20,16 @@ void testTokenizeWithNoCapitalizationOrPeriod() {
      * Write your test here!
      */
 
+@Test
+void testTokenizeWithExtraSpaces() {
+    // Runs of several spaces between words must not produce empty tokens.
+    Scanner input = new Scanner("hello     hi hi hi    hello hello");
+    List<String> expected = List.of("hello", "hi", "hi", "hi", "hello", "hello");
+
+    List<String> actual = new LowercaseSentenceTokenizer().tokenize(input);
+
+    assertEquals(expected, actual);
+}
+
+
 
     // Wave 3
     @Test

diff --git a/src/UnigramWordPredictor.java b/src/UnigramWordPredictor.java
@@ -2,6 +2,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 import java.util.Scanner;
 
 /**
@@ -10,8 +11,9 @@
  * words that directly follow it in the text.
  */
 public class UnigramWordPredictor implements WordPredictor {
-  private Map<String, List<String>> neighborMap;
+  private Map<String, List<String>> neighborMap; 
   private Tokenizer tokenizer;
+  private Random random; 
 
   /**
    * Constructs a UnigramWordPredictor with the specified tokenizer.
@@ -20,6 +22,8 @@ public class UnigramWordPredictor implements WordPredictor {
    */
   public UnigramWordPredictor(Tokenizer tokenizer) {
     this.tokenizer = tokenizer;
+    // Start with an empty map so predictNextWord can look words up even before train is called.
+    this.neighborMap = new HashMap<>(); 
+    // Source of randomness predictNextWord uses to pick among the recorded following words.
+    this.random = new Random(); 
   }
 
   /**
@@ -29,29 +33,27 @@ public UnigramWordPredictor(Tokenizer tokenizer) {
    * in the text. The resultant map is stored in the neighborMap
    * instance variable.
    * 
-   * For example:
-   * If the input text is: "The cat sat. The cat slept. The dog barked."
-   * After tokenizing, the tokens would be: ["the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", "."]
-   * 
-   * The resulting map (neighborMap) would be:
-   * {
-   *   "the" -> ["cat", "cat", "dog"],
-   *   "cat" -> ["sat", "slept"],
-   *   "sat" -> ["."],
-   *   "." -> ["the", "the"],
-   *   "slept" -> ["."],
-   *   "dog" -> ["barked"],
-   *   "barked" -> ["."]
-   * }
-   * 
-   * The order of the map and the order of each list is not important.
-   * 
    * @param scanner the Scanner to read the training text from
    */
   public void train(Scanner scanner) {
     List<String> trainingWords = tokenizer.tokenize(scanner);
 
-    // TODO: Convert the trainingWords into neighborMap here
+    // Walk every adjacent (previous, following) pair of tokens. With fewer than two tokens
+    // the loop body never runs and neighborMap is left unchanged.
+    for (int next = 1; next < trainingWords.size(); next++) {
+      String previous = trainingWords.get(next - 1);
+      String following = trainingWords.get(next);
+
+      // Create the follower list the first time a word is seen, then record the word that
+      // came after it. Repeats are kept, so the list reflects how often each follower occurred.
+      neighborMap.computeIfAbsent(previous, key -> new ArrayList<>()).add(following);
+    }
   }
 
   /**
@@ -98,18 +100,37 @@ public void train(Scanner scanner) {
    * @param context a list of words representing the current context
    * @return the predicted next word, or null if no prediction can be made
    */
+
+   /* Explanation of the code: the predictNextWord method takes a list of
+    words as context and predicts the next word. It first checks whether the context is empty and
+     returns null if so. Then it retrieves the last word from the list and looks it up in neighborMap,
+     which stores each word together with the words that followed it in the training data. If the last word
+     exists in the map and has a list of next words, one entry is picked at random; a word that followed more
+      often appears more times in the list, so it is picked more often. If no match is found, it returns null.
+ */
+/* I used this link to learn how to get the last word: https://stackoverflow.com/questions/4672806/java-simplest-way-to-get-last-word-in-a-string */
   public String predictNextWord(List<String> context) {
-    // TODO: Return a predicted word given the words preceding it
-    // Hint: only the last word in context should be looked at
-    return null;
+    // Nothing to base a prediction on.
+    if (context.isEmpty()) {
+      return null;
+    }
+
+    // Only the final word of the context determines the prediction.
+    String lastWord = context.get(context.size() - 1);
+    List<String> followers = neighborMap.getOrDefault(lastWord, List.of());
+
+    // train stores repeated followers as repeated entries, so a uniform random index into the
+    // list picks each follower in proportion to how often it was recorded.
+    return followers.isEmpty() ? null : followers.get(random.nextInt(followers.size()));
   }
-  
+
   /**
    * Returns a copy of the neighbor map. The neighbor map is a mapping 
    * from each word to a list of words that have followed it in the training data.
    * 
-   * You do not need to modify this method for your project.
-   * 
    * @return a copy of the neighbor map
    */
   public Map<String, List<String>> getNeighborMap() {

diff --git a/src/UnigramWordPredictorTest.java b/src/UnigramWordPredictorTest.java
@@ -5,8 +5,7 @@
 import static org.junit.jupiter.api.Assertions.*;
 
 class UnigramWordPredictorTest {
-
-    // Wave 4
+// Wave 4
     /**
      * Tests the train method by checking that the generated neighbor map matches the expected map.
      * 
@@ -26,21 +25,21 @@ class UnigramWordPredictorTest {
      */
     @Test
     void testTrainAndGetNeighborMap() {
-        // Use a fake tokenizer with predefined tokens
+        // Simulated training text with tokenized words
         FakeTokenizer fakeTokenizer = new FakeTokenizer(
             List.of("the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", ".")
         );
+
         UnigramWordPredictor predictor = new UnigramWordPredictor(fakeTokenizer);
-
-        predictor.train(null); // The scanner input is ignored by FakeTokenizer
+        predictor.train(null); // Scanner input is ignored by FakeTokenizer
+
         Map<String, List<String>> neighborMap = predictor.getNeighborMap();
 
-        // Sort the actual lists to ensure order does not affect comparison
         for (List<String> values : neighborMap.values()) {
-            values.sort(null); // Sort alphabetically
+            values.sort(null);
         }
-
         // Pre-sorted expected map
+
         Map<String, List<String>> expectedMap = Map.of(
             "the", List.of("cat", "cat", "dog"),
             "cat", List.of("sat", "slept"),
@@ -74,45 +73,26 @@ void testTrainAndGetNeighborMap() {
      * 
      * The test verifies that the predictions for various words are consistent with these patterns.
      */
+
     @Test
     void testPredictNextWord() {
-        // Use a fake tokenizer with a new set of predefined tokens
+        // Use a fake tokenizer with a new set of predefined tokens
+
         FakeTokenizer fakeTokenizer = new FakeTokenizer(
             List.of("the", "quick", "brown", "fox", ".", "a", "quick", "red", "fox", ".", "the", "slow", "green", "turtle", ".")
         );
+
         UnigramWordPredictor predictor = new UnigramWordPredictor(fakeTokenizer);
-
-        predictor.train(null); // The scanner input is ignored by FakeTokenizer
-
-        // Predicting the next word after "the" should be "quick" or "slow"
-        String nextWord = predictor.predictNextWord(List.of("the"));
-        assertTrue(nextWord.equals("quick") || nextWord.equals("slow"));
-
-        // Predicting the next word after "a" should be "quick"
-        nextWord = predictor.predictNextWord(List.of("a"));
-        assertEquals("quick", nextWord);
-
-        // Predicting the next word after "quick" should be either "brown" or "red"
-        nextWord = predictor.predictNextWord(List.of("quick"));
-        assertTrue(nextWord.equals("brown") || nextWord.equals("red"));
-
-        // Predicting the next word after "fox" should always be "."
-        nextWord = predictor.predictNextWord(List.of("fox"));
-        assertEquals(".", nextWord);
-
-        // Predicting the next word after "slow" should always be "green"
-        nextWord = predictor.predictNextWord(List.of("slow"));
-        assertEquals("green", nextWord);
-
-        // Predicting the next word after "turtle" should always be "."
-        nextWord = predictor.predictNextWord(List.of("turtle"));
-        assertEquals(".", nextWord);
-
-        // Predicting the next word after "." should be "the" or "a"
-        nextWord = predictor.predictNextWord(List.of("."));
-        assertTrue(nextWord.equals("the") || nextWord.equals("a"));
-    }
+        predictor.train(null); // The scanner input is ignored by FakeTokenizer
 
+        // Words with more than one recorded follower ("the", "quick", ".") may return any of
+        // them, so those checks accept a set; words with a single follower must return it exactly.
+        assertTrue(List.of("quick", "slow").contains(predictor.predictNextWord(List.of("the"))));
+        assertEquals("quick", predictor.predictNextWord(List.of("a")));
+        assertTrue(List.of("brown", "red").contains(predictor.predictNextWord(List.of("quick"))));
+        assertEquals(".", predictor.predictNextWord(List.of("fox")));
+        assertEquals("green", predictor.predictNextWord(List.of("slow")));
+        assertEquals(".", predictor.predictNextWord(List.of("turtle")));
+        assertTrue(List.of("the", "a").contains(predictor.predictNextWord(List.of("."))));
+    }
 
     // Wave 5
     /**
@@ -132,50 +112,43 @@ void testPredictNextWord() {
      * The test runs multiple trials to estimate these probabilities and compares them to the
      * expected values with some tolerance for variation.
      */
+
     @Test
     void testPredictNextWordProbabilistically() {
-        // Use a fake tokenizer with predefined tokens
         FakeTokenizer fakeTokenizer = new FakeTokenizer(
             List.of("the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", ".")
         );
+
         UnigramWordPredictor predictor = new UnigramWordPredictor(fakeTokenizer);
-
-        predictor.train(null); // The scanner input is ignored by FakeTokenizer
+        predictor.train(null); // The scanner input is ignored by FakeTokenizer
 
-        // Perform multiple trials to check word frequencies
-        int trials = 10000; // Number of trials for statistical testing
-        double tolerance = 0.05; // Tolerance for frequency comparison
+        // Number of predictions sampled per word for the probability check
+        int trials = 10000;
+        // Largest allowed absolute difference between an observed and an expected frequency
+        double tolerance = 0.05; 
 
-        // Expected probabilities
-        Map<String, Map<String, Double>> expectedProbabilities = new HashMap<>();
-        expectedProbabilities.put("the", Map.of("cat", 2.0 / 3.0, "dog", 1.0 / 3.0));
-        expectedProbabilities.put("cat", Map.of("sat", 0.5, "slept", 0.5));
-        expectedProbabilities.put("dog", Map.of("barked", 1.0));
-        expectedProbabilities.put(".", Map.of("the", 1.0));
+        // Expected follower probabilities, derived from the token list above
+        // (e.g. "the" is followed by "cat" twice and by "dog" once).
+        Map<String, Map<String, Double>> expectedProbabilities = Map.of(
+            "the", Map.of("cat", 2.0 / 3.0, "dog", 1.0 / 3.0),
+            "cat", Map.of("sat", 0.5, "slept", 0.5),
+            "dog", Map.of("barked", 1.0),
+            ".", Map.of("the", 1.0)
+        );
 
-        // Run trials for each test case
         for (String word : expectedProbabilities.keySet()) {
             Map<String, Integer> counts = new HashMap<>();
             Map<String, Double> expected = expectedProbabilities.get(word);
 
-            // Initialize counts for expected words
-            for (String nextWord : expected.keySet()) {
-                counts.put(nextWord, 0);
-            }
+            // Pre-seed every expected follower with 0 so counts.get(nextWord) below is never null.
+            expected.keySet().forEach(nextWord -> counts.put(nextWord, 0));
 
-            // Perform trials
             for (int i = 0; i < trials; i++) {
                 String predictedWord = predictor.predictNextWord(List.of(word));
                 counts.put(predictedWord, counts.getOrDefault(predictedWord, 0) + 1);
             }
 
-            // Check frequencies
             for (String nextWord : expected.keySet()) {
                 double observedFrequency = counts.get(nextWord) / (double) trials;
                 double expectedFrequency = expected.get(nextWord);
                 assertTrue(Math.abs(observedFrequency - expectedFrequency) < tolerance,
-                        "Observed frequency of '" + nextWord + "' after '" + word +
-                        "' was " + observedFrequency + ", expected " + expectedFrequency);
+                        "Frequency mismatch for '" + word + "': observed " + observedFrequency + ", expected " + expectedFrequency);
             }
         }
     }

diff --git a/src/WordPredictor.java b/src/WordPredictor.java
@@ -23,4 +23,4 @@ public interface WordPredictor {
    * @return the predicted next word, or null if no prediction can be made
    */
   public String predictNextWord(List<String> context);
-}
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,4 +23,4 @@ public interface WordPredictor { @@
        * @return the predicted next word, or null if no prediction can be made
        */
       public String predictNextWord(List<String> context);
-    }
+    }