Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,6 @@ Consider doing any of the following (some are very hard!):
## Submitting
Submit your project by making a PR and copying the link to the canvas assignment.

TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED.
TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED.

shams
195 changes: 195 additions & 0 deletions oscarWildeTraining.txt

Large diffs are not rendered by default.

42 changes: 38 additions & 4 deletions src/LowercaseSentenceTokenizer.java
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@


import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

Expand Down Expand Up @@ -28,9 +31,40 @@ public class LowercaseSentenceTokenizer implements Tokenizer {
* @param scanner the Scanner to read the input text from
* @return a list of tokens, where each token is a word or a period
*/
public List<String> tokenize(Scanner scanner) {
// TODO: Implement this function to convert the scanner's input to a list of words and periods
return null;
}
/* TODO: Implement this function to convert the scanner's input to a list of words and periods
return null; */


@Override
public List<String> tokenize(Scanner scanner) {
List<String> tokens = new ArrayList<>();

String input = scanner.nextLine().toLowerCase();
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this only reads in a single line of input and doesn't handle multi-line input.


// Use "\\s+" to split by any number of spaces (removes empty tokens)
String[] words = input.split("\\s+");

for (String word : words) {
if (word.endsWith(".")) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Smart!

// I ues the substring in this website https://www.w3schools.com/jsref/jsref_substring.asp
/* explain what this code do...
This code reads a sentence, converts it to lowercase, and splits it into words. It then checks
if any word ends with a period (.). If there is a period at the end of a word, the period is
separated and stored as its own token. Anything without a period gets directly added to the list. Finally,
This function separates the words from the period as different tokens in a list.
*/
String withoutPeriod = word.substring(0, word.length() - 1);
if (!withoutPeriod.isEmpty()) {
tokens.add(withoutPeriod); // Add the word without the period
}
tokens.add("."); // Add the period as a separate token
} else {
tokens.add(word);
}
}

return tokens;
}
}


10 changes: 10 additions & 0 deletions src/LowercaseSentenceTokenizerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ void testTokenizeWithNoCapitalizationOrPeriod() {
* Write your test here!
*/

@Test
void testTokenizeWithExtraSpaces() {
    LowercaseSentenceTokenizer tokenizer = new LowercaseSentenceTokenizer();
    // Separate the words with runs of several spaces (and a tab) so the test
    // actually exercises whitespace collapsing rather than single-space input.
    Scanner scanner = new Scanner("hello   hi hi \t hi    hello  hello");
    List<String> tokens = tokenizer.tokenize(scanner);

    // No empty tokens may appear between the words.
    assertEquals(List.of("hello", "hi", "hi", "hi", "hello", "hello"), tokens);
}
Comment on lines +23 to +30
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test!




// Wave 3
@Test
Expand Down
71 changes: 46 additions & 25 deletions src/UnigramWordPredictor.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Scanner;

/**
Expand All @@ -10,8 +11,9 @@
* words that directly follow it in the text.
*/
public class UnigramWordPredictor implements WordPredictor {
private Map<String, List<String>> neighborMap;
private Map<String, List<String>> neighborMap;
private Tokenizer tokenizer;
private Random random;

/**
* Constructs a UnigramWordPredictor with the specified tokenizer.
Expand All @@ -20,6 +22,8 @@ public class UnigramWordPredictor implements WordPredictor {
*/
public UnigramWordPredictor(Tokenizer tokenizer) {
    // Fresh random source used when picking a predicted word.
    this.random = new Random();
    // Starts empty; train(...) fills it with word -> follower lists.
    this.neighborMap = new HashMap<>();
    // Tokenizer that train(...) uses to split the training text.
    this.tokenizer = tokenizer;
}

/**
Expand All @@ -29,29 +33,27 @@ public UnigramWordPredictor(Tokenizer tokenizer) {
* in the text. The resultant map is stored in the neighborMap
* instance variable.
*
* For example:
* If the input text is: "The cat sat. The cat slept. The dog barked."
* After tokenizing, the tokens would be: ["the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", "."]
*
* The resulting map (neighborMap) would be:
* {
* "the" -> ["cat", "cat", "dog"],
* "cat" -> ["sat", "slept"],
* "sat" -> ["."],
* "." -> ["the", "the"],
* "slept" -> ["."],
* "dog" -> ["barked"],
* "barked" -> ["."]
* }
*
* The order of the map and the order of each list is not important.
*
* @param scanner the Scanner to read the training text from
*/
public void train(Scanner scanner) {
List<String> trainingWords = tokenizer.tokenize(scanner);

// TODO: Convert the trainingWords into neighborMap here
if (trainingWords.isEmpty()) {
return;
}

for (int i = 0; i < trainingWords.size() - 1; i++) {
String currentWord = trainingWords.get(i);
String nextWord = trainingWords.get(i + 1);

// this code mean ...if currentWord is not already in neighborMap, add it with an empty list as its value.


neighborMap.putIfAbsent(currentWord, new ArrayList<>());

// Store the word that follows
neighborMap.get(currentWord).add(nextWord);
}
Comment on lines +45 to +56
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

}

/**
Expand Down Expand Up @@ -98,18 +100,37 @@ public void train(Scanner scanner) {
* @param context a list of words representing the current context
* @return the predicted next word, or null if no prediction can be made
*/

/* Explanation: the predictNextWord method takes a list of
words as context and predicts the next word. It first checks whether the context is empty and
returns null if so. Then it retrieves the last word from the list and looks it up in neighborMap,
which stores each word together with the words that followed it in the training data. If the last word exists in the map
and has a list of next words, one of them is selected at random; a word that followed more often appears more
often in the list, so it is selected more often. If no match is found, the method returns
null.
*/
/* I used this link to learn how to get the last word: https://stackoverflow.com/questions/4672806/java-simplest-way-to-get-last-word-in-a-string */
public String predictNextWord(List<String> context) {
// TODO: Return a predicted word given the words preceding it
// Hint: only the last word in context should be looked at
return null;
if (context.isEmpty()) {
return null;
}

String lastWord = context.get(context.size() - 1); // Get last word

if (neighborMap.containsKey(lastWord)) {
List<String> nextWords = neighborMap.get(lastWord);

if (!nextWords.isEmpty()) {
return nextWords.get(random.nextInt(nextWords.size()));
}
}
Comment on lines +117 to +125
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice logic


return null;
}

/**
* Returns a copy of the neighbor map. The neighbor map is a mapping
* from each word to a list of words that have followed it in the training data.
*
* You do not need to modify this method for your project.
*
* @return a copy of the neighbor map
*/
public Map<String, List<String>> getNeighborMap() {
Expand Down
95 changes: 34 additions & 61 deletions src/UnigramWordPredictorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import static org.junit.jupiter.api.Assertions.*;

class UnigramWordPredictorTest {

// Wave 4
// Wave 4
/**
* Tests the train method by checking that the generated neighbor map matches the expected map.
*
Expand All @@ -26,21 +25,21 @@ class UnigramWordPredictorTest {
*/
@Test
void testTrainAndGetNeighborMap() {
// Use a fake tokenizer with predefined tokens
// Simulated training text with tokenized words
FakeTokenizer fakeTokenizer = new FakeTokenizer(
List.of("the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", ".")
);

UnigramWordPredictor predictor = new UnigramWordPredictor(fakeTokenizer);

predictor.train(null); // The scanner input is ignored by FakeTokenizer
predictor.train(null); // Scanner input is ignored by FakeTokenizer

Map<String, List<String>> neighborMap = predictor.getNeighborMap();

// Sort the actual lists to ensure order does not affect comparison
for (List<String> values : neighborMap.values()) {
values.sort(null); // Sort alphabetically
values.sort(null);
}

// Pre-sorted expected map

Map<String, List<String>> expectedMap = Map.of(
"the", List.of("cat", "cat", "dog"),
"cat", List.of("sat", "slept"),
Expand Down Expand Up @@ -74,45 +73,26 @@ void testTrainAndGetNeighborMap() {
*
* The test verifies that the predictions for various words are consistent with these patterns.
*/

@Test
void testPredictNextWord() {
// Use a fake tokenizer with a new set of predefined tokens
// Use a fake tokenizer with a new set of predefined tokens

FakeTokenizer fakeTokenizer = new FakeTokenizer(
List.of("the", "quick", "brown", "fox", ".", "a", "quick", "red", "fox", ".", "the", "slow", "green", "turtle", ".")
);

UnigramWordPredictor predictor = new UnigramWordPredictor(fakeTokenizer);

predictor.train(null); // The scanner input is ignored by FakeTokenizer

// Predicting the next word after "the" should be "quick" or "slow"
String nextWord = predictor.predictNextWord(List.of("the"));
assertTrue(nextWord.equals("quick") || nextWord.equals("slow"));

// Predicting the next word after "a" should be "quick"
nextWord = predictor.predictNextWord(List.of("a"));
assertEquals("quick", nextWord);

// Predicting the next word after "quick" should be either "brown" or "red"
nextWord = predictor.predictNextWord(List.of("quick"));
assertTrue(nextWord.equals("brown") || nextWord.equals("red"));

// Predicting the next word after "fox" should always be "."
nextWord = predictor.predictNextWord(List.of("fox"));
assertEquals(".", nextWord);

// Predicting the next word after "slow" should always be "green"
nextWord = predictor.predictNextWord(List.of("slow"));
assertEquals("green", nextWord);

// Predicting the next word after "turtle" should always be "."
nextWord = predictor.predictNextWord(List.of("turtle"));
assertEquals(".", nextWord);

// Predicting the next word after "." should be "the" or "a"
nextWord = predictor.predictNextWord(List.of("."));
assertTrue(nextWord.equals("the") || nextWord.equals("a"));
}
predictor.train(null);

assertTrue(List.of("quick", "slow").contains(predictor.predictNextWord(List.of("the"))));
assertEquals("quick", predictor.predictNextWord(List.of("a")));
assertTrue(List.of("brown", "red").contains(predictor.predictNextWord(List.of("quick"))));
assertEquals(".", predictor.predictNextWord(List.of("fox")));
assertEquals("green", predictor.predictNextWord(List.of("slow")));
assertEquals(".", predictor.predictNextWord(List.of("turtle")));
assertTrue(List.of("the", "a").contains(predictor.predictNextWord(List.of("."))));
}

// Wave 5
/**
Expand All @@ -132,50 +112,43 @@ void testPredictNextWord() {
* The test runs multiple trials to estimate these probabilities and compares them to the
* expected values with some tolerance for variation.
*/

@Test
void testPredictNextWordProbabilistically() {
// Use a fake tokenizer with predefined tokens
FakeTokenizer fakeTokenizer = new FakeTokenizer(
List.of("the", "cat", "sat", ".", "the", "cat", "slept", ".", "the", "dog", "barked", ".")
);

UnigramWordPredictor predictor = new UnigramWordPredictor(fakeTokenizer);

predictor.train(null); // The scanner input is ignored by FakeTokenizer
predictor.train(null);

// Perform multiple trials to check word frequencies
int trials = 10000; // Number of trials for statistical testing
double tolerance = 0.05; // Tolerance for frequency comparison
// Number of trials for probability check
int trials = 10000;
double tolerance = 0.05;

// Expected probabilities
Map<String, Map<String, Double>> expectedProbabilities = new HashMap<>();
expectedProbabilities.put("the", Map.of("cat", 2.0 / 3.0, "dog", 1.0 / 3.0));
expectedProbabilities.put("cat", Map.of("sat", 0.5, "slept", 0.5));
expectedProbabilities.put("dog", Map.of("barked", 1.0));
expectedProbabilities.put(".", Map.of("the", 1.0));
Map<String, Map<String, Double>> expectedProbabilities = Map.of(
"the", Map.of("cat", 2.0 / 3.0, "dog", 1.0 / 3.0),
"cat", Map.of("sat", 0.5, "slept", 0.5),
"dog", Map.of("barked", 1.0),
".", Map.of("the", 1.0)
);

// Run trials for each test case
for (String word : expectedProbabilities.keySet()) {
Map<String, Integer> counts = new HashMap<>();
Map<String, Double> expected = expectedProbabilities.get(word);

// Initialize counts for expected words
for (String nextWord : expected.keySet()) {
counts.put(nextWord, 0);
}
expected.keySet().forEach(nextWord -> counts.put(nextWord, 0));

// Perform trials
for (int i = 0; i < trials; i++) {
String predictedWord = predictor.predictNextWord(List.of(word));
counts.put(predictedWord, counts.getOrDefault(predictedWord, 0) + 1);
}

// Check frequencies
for (String nextWord : expected.keySet()) {
double observedFrequency = counts.get(nextWord) / (double) trials;
double expectedFrequency = expected.get(nextWord);
assertTrue(Math.abs(observedFrequency - expectedFrequency) < tolerance,
"Observed frequency of '" + nextWord + "' after '" + word +
"' was " + observedFrequency + ", expected " + expectedFrequency);
"Frequency mismatch for '" + word + "': observed " + observedFrequency + ", expected " + expectedFrequency);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/WordPredictor.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ public interface WordPredictor {
* @return the predicted next word, or null if no prediction can be made
*/
public String predictNextWord(List<String> context);
}
}