From 3187f30aa6cd930411dc17e32627884ad20b26ea Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:48:28 -0800 Subject: [PATCH 01/15] added text to readme for git validation --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e57375e..0ff8656 100644 --- a/README.md +++ b/README.md @@ -110,4 +110,7 @@ Consider doing any of the following (some are very hard!): ## Submitting Submit your project by making a PR and copying the link to the canvas assignment. -TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED. \ No newline at end of file +TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED. + + +git validation test \ No newline at end of file From f45519407bf952360eb03c32c5c84b43493dbab5 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Thu, 23 Jan 2025 22:05:20 -0800 Subject: [PATCH 02/15] completed Wave1 --- README.md | 7 ++----- src/LowercaseSentenceTokenizer.java | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0ff8656..e1efdec 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ This is a large, difficult project. Start early, and get help when you need it. Sometimes this button takes a little bit to show up when you first open VS Code. If you're not seeing it, make sure you have the Java extension pack installed and it is active. 1. It should ask you for a filename. Give it the following filename: ``` - wikipediaData.txt + keatsTraining.txt ``` Then hit enter. 1. It should ask you for a number of words. Enter a positive integer and hit enter. @@ -110,7 +110,4 @@ Consider doing any of the following (some are very hard!): ## Submitting Submit your project by making a PR and copying the link to the canvas assignment. -TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED. - - -git validation test \ No newline at end of file +TURN SOMETHING IN BY THE DUE DATE EVEN IF YOU'RE NOT FINISHED. \ No newline at end of file diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index cc8285d..623375e 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -1,3 +1,4 @@ +import java.util.ArrayList; import java.util.List; import java.util.Scanner; @@ -28,9 +29,18 @@ public class LowercaseSentenceTokenizer implements Tokenizer { * @param scanner the Scanner to read the input text from * @return a list of tokens, where each token is a word or a period */ - public List tokenize(Scanner scanner) { + public List tokenize(Scanner scanner) + { // TODO: Implement this function to convert the scanner's input to a list of words and periods - return null; + List tokenList = new ArrayList<>(); + + while(scanner.hasNext()) + { + String token = scanner.next().toLowerCase(); + tokenList.add(token); + } + + return tokenList; } } From e2d4f5c08034d6f666e2044db2f65ec3e10008db Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Thu, 23 Jan 2025 22:11:28 -0800 Subject: [PATCH 03/15] completed Wave2 --- src/LowercaseSentenceTokenizerTest.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/LowercaseSentenceTokenizerTest.java b/src/LowercaseSentenceTokenizerTest.java index 85ac3a2..0db187b 100644 --- a/src/LowercaseSentenceTokenizerTest.java +++ b/src/LowercaseSentenceTokenizerTest.java @@ -15,10 +15,16 @@ void testTokenizeWithNoCapitalizationOrPeriod() { assertEquals(List.of("this", "is", "a", "lowercase", "sentence", "without", "a", "period"), tokens); } + // Wave 2 - /* - * Write your test here! - */ + @Test + void testTokenizeWithManySpaces() { + LowercaseSentenceTokenizer tokenizer = new LowercaseSentenceTokenizer(); + Scanner scanner = new Scanner(" this is a sentence with lots of extra spaces"); + List tokens = tokenizer.tokenize(scanner); + + assertEquals(List.of("this", "is", "a", "sentence", "with", "lots", "of", "extra", "spaces"), tokens); + } // Wave 3 From 0b9d9f3429d25ebdae022a5b0e13411acd89ddcc Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Thu, 23 Jan 2025 22:35:02 -0800 Subject: [PATCH 04/15] completed Wave3 --- src/LowercaseSentenceTokenizer.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index 623375e..0dae82c 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -37,7 +37,18 @@ public List tokenize(Scanner scanner) while(scanner.hasNext()) { String token = scanner.next().toLowerCase(); - tokenList.add(token); + + if(token.endsWith(".")) + { + token = token.substring(0,token.length()-1); + tokenList.add(token); + tokenList.add("."); + } + + else + { + tokenList.add(token); + } } return tokenList; From 959ab86beece8c79152d3c01351cf33d703f629f Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Fri, 24 Jan 2025 00:02:08 -0800 Subject: [PATCH 05/15] completed Wave4 --- src/UnigramWordPredictor.java | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/UnigramWordPredictor.java b/src/UnigramWordPredictor.java index d713250..7401650 100644 --- a/src/UnigramWordPredictor.java +++ b/src/UnigramWordPredictor.java @@ -48,10 +48,42 @@ public UnigramWordPredictor(Tokenizer tokenizer) { * * @param scanner the Scanner to read the training text from */ - public void train(Scanner scanner) { + public void train(Scanner scanner) + { List trainingWords = tokenizer.tokenize(scanner); // TODO: Convert the trainingWords into neighborMap here + + neighborMap = new HashMap<>(); + + for (int i = 0; i < trainingWords.size() - 1; i++) + { + String word = trainingWords.get(i); + String nextWord = trainingWords.get(i + 1); + + if (neighborMap.containsKey(word)) + { + Listtemp = neighborMap.get(word); + temp.add(nextWord); + neighborMap.put(word, temp); + } + + else + { + Listtemp = new ArrayList<>(); + temp.add(nextWord); + neighborMap.put(word, temp); + } + } + + String lastWord = trainingWords.get(trainingWords.size()- 1); + + if (!neighborMap.containsKey(lastWord)) + { + List temp = new ArrayList<>(); + neighborMap.put(lastWord, temp); + } + } /** From cd6c063c3a2686b0d7024da9a32ef8e77c576585 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Fri, 24 Jan 2025 22:44:25 -0800 Subject: [PATCH 06/15] completed Wave5 --- src/UnigramWordPredictor.java | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/UnigramWordPredictor.java b/src/UnigramWordPredictor.java index 7401650..24de14a 100644 --- a/src/UnigramWordPredictor.java +++ b/src/UnigramWordPredictor.java @@ -3,6 +3,7 @@ import java.util.List; import java.util.Map; import java.util.Scanner; +import java.util.Random; /** * A class for predicting the next word in a sequence using a unigram model. @@ -130,10 +131,25 @@ public void train(Scanner scanner) * @param context a list of words representing the current context * @return the predicted next word, or null if no prediction can be made */ - public String predictNextWord(List context) { + public String predictNextWord(List context) + { // TODO: Return a predicted word given the words preceding it // Hint: only the last word in context should be looked at + + String lastWordInContext = context.get(context.size()-1); + Random random = new Random(); + String nextWord = ""; + + if (neighborMap.containsKey(lastWordInContext)) + { + List temp = neighborMap.get(lastWordInContext); + int index = random.nextInt(temp.size()); + nextWord = temp.get(index); + return nextWord; + } + return null; + } /** From 4adae5b6bb146229aa5454828ab34f1008ed09c0 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Fri, 24 Jan 2025 23:31:52 -0800 Subject: [PATCH 07/15] completed Wave6 --- dylanTraining.txt | 48 +++++++++++++++++++++++++++++++++++++++++++ lincolnTraining.txt | 16 +++++++++++++++ ramblebotOutput.txt | 13 ++++++++++++ yodaTraining.txt | 50 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 127 insertions(+) create mode 100644 dylanTraining.txt create mode 100644 lincolnTraining.txt create mode 100644 ramblebotOutput.txt create mode 100644 yodaTraining.txt diff --git a/dylanTraining.txt b/dylanTraining.txt new file mode 100644 index 0000000..d3880ac --- /dev/null +++ b/dylanTraining.txt @@ -0,0 +1,48 @@ +Once upon a time you dressed so fine +Threw the bums a dime in your prime, didn't you? +People call say 'beware doll, you're bound to fall' +You thought they were all kidding you +You used to laugh about +Everybody that was hanging out +Now you don't talk so loud +Now you don't seem so proud +About having to be scrounging your next meal +How does it feel, how does it feel? +To be without a home +Like a complete unknown, like a rolling stone +Ahh you've gone to the finest schools, alright Miss Lonely +But you know you only used to get juiced in it +Nobody's ever taught you how to live out on the street +And now you're gonna have to get used to it +You say you never compromise +With the mystery tramp, but now you realize +He's not selling any alibis +As you stare into the vacuum of his eyes +And say do you want to make a deal? +How does it feel, how does it feel? +To be on your own, with no direction home +A complete unknown, like a rolling stone +Ah you never turned around to see the frowns +On the jugglers and the clowns when they all did tricks for you +You never understood that it ain't no good +You shouldn't let other people get your kicks for you +You used to ride on a chrome horse with your diplomat +Who carried on his shoulder a Siamese cat +Ain't it hard when you discovered that +He really wasn't where it's at +After he took from you everything he could steal +How does it feel, how does it feel? +To be on your own, with no direction home +Like a complete unknown, like a rolling stone +Ahh princess on a steeple and all the pretty people +They're all drinking, thinking that they've got it made +Exchanging all precious gifts +But you better take your diamond ring, you better pawn it babe +You used to be so amused +At Napoleon in rags and the language that he used +Go to him he calls you, you can't refuse +When you ain't got nothing, you got nothing to lose +You're invisible now, you've got no secrets to conceal +How does it feel, ah how does it feel? +To be on your own, with no direction home +Like a complete unknown, like a rolling stone \ No newline at end of file diff --git a/lincolnTraining.txt b/lincolnTraining.txt new file mode 100644 index 0000000..70ed1d5 --- /dev/null +++ b/lincolnTraining.txt @@ -0,0 +1,16 @@ +Four score and seven years ago our fathers brought forth on this continent, a new nation, +conceived in Liberty, and dedicated to the proposition that all men are created equal. +Now we are engaged in a great civil war, testing whether that nation, or any nation so +conceived and so dedicated, can long endure. We are met on a great battle-field of that war. +We have come to dedicate a portion of that field, as a final resting place for those who here +gave their lives that that nation might live. It is altogether fitting and proper that we should do this. + +But, in a larger sense, we can not dedicate—we can not consecrate—we can not hallow—this ground. +The brave men, living and dead, who struggled here, have consecrated it, far above our poor power +to add or detract. The world will little note, nor long remember what we say here, but it can never +forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work +which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to +the great task remaining before us—that from these honored dead we take increased devotion to that cause + for which they gave the last full measure of devotion—that we here highly resolve that these dead shall + not have died in vain—that this nation, under God, shall have a new birth of freedom—and that government + of the people, by the people, for the people, shall not perish from the earth. \ No newline at end of file diff --git a/ramblebotOutput.txt b/ramblebotOutput.txt new file mode 100644 index 0000000..bd2b836 --- /dev/null +++ b/ramblebotOutput.txt @@ -0,0 +1,13 @@ +Enter the filename: yodaTraining.txt +Enter the number of words to generate: 250 +do you? much to be . in number are we do you? much we find him you have learned . luminous beings are no longer certain that strong am . on what know you will . the force and a little more we cling to suffering . to lose . strong . if no winners only survivors . you are we not these things . control . adventure excitement a child is to see always . in a war has fallen . control you start down the shroud of a child is the force . if no mistake you have learned . on many of view . the dark side has . you can . always . the mind . save you must be . to return . difficult to bend fear to lose . you have learned . strong is . size matters not to return . control control you will see always in the force things you must your path you have learned save you it is the dark side fear to be . when nine hundred years old you must learn the dark side has . old you look at me by my young padawan . you will be for knowledge and defense never for attack . in motion is . a powerful ally it can . wars not these things you will know . in number are we discover how much we do or do not there is not these things you have learned . control + + +Enter the filename: lincolnTraining.txt +Enter the number of words to generate: 250 +four score and so dedicated, can never forget what they gave the people, by the proposition that field, as a portion of the unfinished work which they gave their lives that field, as a final resting place for us to dedicate a new birth of that government of the brave men, living and dead, who here to add or any nation might live . it is for the earth . but, in a great civil war, testing whether that field, as a final resting place for us the proposition that government of devotion?that we say here, but it is rather for us to that government of devotion?that we say here, but it is rather for us the brave men, living and so conceived in a great task remaining before us?that from the proposition that nation, under god, shall not dedicate?we can long remember what we should do this continent, a new birth of the people, for the living, rather, to the unfinished work which they gave their lives that field, as a great civil war, testing whether that that nation, or detract . we say here, have thus far above our poor power to add or detract . now we take increased devotion to dedicate a larger sense, we are created equal . but, in a new nation, conceived and dedicated here highly resolve that government of freedom?and that war . it is for the unfinished work which they did here dedicated here gave their lives that field, as a + + +Enter the filename: dylanTraining.txt +Enter the number of words to generate: 250 +once upon a siamese cat ain't got it feel? to it feel, ah you don't seem so loud now you ain't no secrets to fall' you better pawn it ain't got nothing, you say 'beware doll, you're gonna have to be on your own, with no direction home like a home like a chrome horse with no secrets to get your own, with no direction home like a complete unknown, like a chrome horse with no secrets to be on the jugglers and say you dressed so proud about everybody that it feel, how does it feel, ah how does it feel? to get juiced in your own, with your diamond ring, you know you better take your kicks for you say 'beware doll, you're invisible now, you've got nothing to get juiced in rags and the clowns when you say do you you realize he's not selling any alibis as you how does it feel, how does it ain't no direction home like a deal? how does it babe you want to him he really wasn't where it's at napoleon in rags and the frowns on your diamond ring, you everything he really wasn't where it's at napoleon in your kicks for you say do you used to conceal how does it feel? to be without a rolling stone ahh you've gone to fall' you everything he used to him he could steal how to him he could steal how does it babe you stare into the pretty people \ No newline at end of file diff --git a/yodaTraining.txt b/yodaTraining.txt new file mode 100644 index 0000000..f7edd0c --- /dev/null +++ b/yodaTraining.txt @@ -0,0 +1,50 @@ +Do or do not there is no try. +The greatest teacher failure is. +Train yourself to let go of everything you fear to lose. +Fear is the path to the dark side fear leads to anger anger leads to hate hate leads to suffering. +You must unlearn what you have learned. +In a dark place we find ourselves and a little more knowledge lights our way. +Patience you must have my young Padawan. +Difficult to see always in motion is the future. +Adventure excitement a Jedi craves not these things. +Luminous beings are we not this crude matter. +Named must your fear be before banish it you can. +Size matters not look at me judge me by my size, do you? +Much to learn you still have. +Control control you must learn control. +A Jedi uses the force for knowledge and defense never for attack. +Truly wonderful the mind of a child is. +Once you start down the dark path forever will it dominate your destiny. +Always pass on what you have learned. +Wars not make one great. +Attachment leads to jealousy the shadow of greed that is. +The dark side clouds everything impossible to see the future is. +A challenge lifelong it is not to bend fear into anger. +When you look at the dark side careful you must be for the dark side looks back. +To be Jedi is to face the truth and choose. +On many long journeys have I gone. And waited too for others to return. +Your path you must decide. +Many of the truths that we cling to depend on our point of view. +Smaller in number are we but larger in mind. +When nine hundred years old you reach look as good you will not. +A Jedi's strength flows from the force. +Ready are you what know you of ready? +Looking found someone you have eh? +Faith in your new apprentice misplaced may be. +The force will be with you always. +For my ally is the force and a powerful ally it is. +Strong is Vader. Mind what you have learned. Save you it can. +If no mistake you have made losing you are. A different game you should play. +The more we learn the more we discover how much we do not know. +To answer power with power the Jedi way this is not. +Mind what you have learned save you it can. +In the end cowards are those who follow the dark side. +Use your feelings Obi-Wan and find him you will. +You will know when you are calm at peace. +A Jedi uses the Force for knowledge and defense never for attack. +Through the Force things you will see. Other places. The future. the past. Old friends long gone. +No longer certain that one ever does win a war I am. +In war there are no winners only survivors. +Good relations with the Wookiees I have. +The shroud of the dark side has fallen. Begun the Clone War has. +Strong am I with the Force, but not that strong. \ No newline at end of file From 805b4ebe405b11a591bbce62e3d6b378ac745570 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sat, 25 Jan 2025 23:22:46 -0800 Subject: [PATCH 08/15] modified tokenize to handle additional forms of punctuation --- src/LowercaseSentenceTokenizer.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index 0dae82c..003f351 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -37,13 +37,22 @@ public List tokenize(Scanner scanner) while(scanner.hasNext()) { String token = scanner.next().toLowerCase(); + char lastChar = token.charAt(token.length()-1); + char firstChar = token.charAt(0); + String last = String.valueOf(lastChar); + String first = String.valueOf(firstChar); - if(token.endsWith(".")) + if(first.equals("\"")) { - token = token.substring(0,token.length()-1); - tokenList.add(token); - tokenList.add("."); - } + tokenList.add(first); + token = token.substring(1,token.length()); + } + + if(last.equals(".") || last.equals("!")|| last.equals("?") || last.equals(",") || last.equals(";") || last.equals(":") || last.equals("\"")) + { + tokenList.add(token.substring(0,token.length()-1)); + tokenList.add(last); + } else { From 7978fd48c15a2ef96f7b7a5e1a99f3f3ca4d0633 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sat, 25 Jan 2025 23:43:06 -0800 Subject: [PATCH 09/15] modified tokenize to handle more punctuation --- src/LowercaseSentenceTokenizer.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index 003f351..9abd731 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -37,16 +37,23 @@ public List tokenize(Scanner scanner) while(scanner.hasNext()) { String token = scanner.next().toLowerCase(); - char lastChar = token.charAt(token.length()-1); char firstChar = token.charAt(0); - String last = String.valueOf(lastChar); String first = String.valueOf(firstChar); if(first.equals("\"")) { tokenList.add(first); token = token.substring(1,token.length()); - } + } + + if(token.endsWith("...")) + { + tokenList.add(token.substring(0,token.length()-3)); + tokenList.add("..."); + } + + char lastChar = token.charAt(token.length()-1); + String last = String.valueOf(lastChar); if(last.equals(".") || last.equals("!")|| last.equals("?") || last.equals(",") || last.equals(";") || last.equals(":") || last.equals("\"")) { From 769704f0941112b29ba70c3fbee3d2a3328a1892 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sat, 25 Jan 2025 23:58:58 -0800 Subject: [PATCH 10/15] added test for words ending with ellipsis --- src/LowercaseSentenceTokenizerTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/LowercaseSentenceTokenizerTest.java b/src/LowercaseSentenceTokenizerTest.java index 0db187b..52c8dcf 100644 --- a/src/LowercaseSentenceTokenizerTest.java +++ b/src/LowercaseSentenceTokenizerTest.java @@ -56,5 +56,14 @@ void testTokenizeWithInternalPeriod() { assertEquals(List.of("hello", "world", ".", "this", "is", "dr.smith's", "example", "."), tokens); } + + @Test + void testTokenizeEndsWithElipsisi() { + LowercaseSentenceTokenizer tokenizer = new LowercaseSentenceTokenizer(); + Scanner scanner = new Scanner("Hello world... This is Dr.Smith's example..."); + List tokens = tokenizer.tokenize(scanner); + + assertEquals(List.of("hello", "world", "...", "this", "is", "dr.smith's", "example", "..."), tokens); + } } From 823c0863f7aedacd1a4fed41c03939454580c626 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sun, 26 Jan 2025 00:04:15 -0800 Subject: [PATCH 11/15] fixed bug in tokenize method identified by new test --- src/LowercaseSentenceTokenizer.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index 9abd731..edcb15a 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -39,7 +39,9 @@ public List tokenize(Scanner scanner) String token = scanner.next().toLowerCase(); char firstChar = token.charAt(0); String first = String.valueOf(firstChar); - + char lastChar = token.charAt(token.length()-1); + String last = String.valueOf(lastChar); + if(first.equals("\"")) { tokenList.add(first); @@ -48,14 +50,12 @@ public List tokenize(Scanner scanner) if(token.endsWith("...")) { - tokenList.add(token.substring(0,token.length()-3)); + token = token.substring(0,token.length()-3); + tokenList.add(token); tokenList.add("..."); } - - char lastChar = token.charAt(token.length()-1); - String last = String.valueOf(lastChar); - if(last.equals(".") || last.equals("!")|| last.equals("?") || last.equals(",") || last.equals(";") || last.equals(":") || last.equals("\"")) + else if(last.equals(".") || last.equals("!")|| last.equals("?") || last.equals(",") || last.equals(";") || last.equals(":") || last.equals("\"")) { tokenList.add(token.substring(0,token.length()-1)); tokenList.add(last); From 75dbd23e0dd90bc36d31f05a57122e97418c2fb0 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sun, 26 Jan 2025 00:13:57 -0800 Subject: [PATCH 12/15] added test for sentences that are in quotes --- src/LowercaseSentenceTokenizerTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/LowercaseSentenceTokenizerTest.java b/src/LowercaseSentenceTokenizerTest.java index 52c8dcf..6d8da2b 100644 --- a/src/LowercaseSentenceTokenizerTest.java +++ b/src/LowercaseSentenceTokenizerTest.java @@ -65,5 +65,14 @@ void testTokenizeEndsWithElipsisi() { assertEquals(List.of("hello", "world", "...", "this", "is", "dr.smith's", "example", "..."), tokens); } + + @Test + void testTokenizeSentenceInQuotes() { + LowercaseSentenceTokenizer tokenizer = new LowercaseSentenceTokenizer(); + Scanner scanner = new Scanner("\"Hello world. This is Dr.Smith's example.\""); + List tokens = tokenizer.tokenize(scanner); + + assertEquals(List.of("\"", "hello","world", ".", "this", "is", "dr.smith's", "example", ".","\""), tokens); + } } From f0551f579fa0e1f8cda6adc99acf72a6cc9f4393 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sun, 26 Jan 2025 00:18:15 -0800 Subject: [PATCH 13/15] fixed bug in tokenize method identified by new test --- src/LowercaseSentenceTokenizer.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index edcb15a..4ed69fc 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -54,6 +54,13 @@ public List tokenize(Scanner scanner) tokenList.add(token); tokenList.add("..."); } + else if(token.endsWith(".\"")) + { + token = token.substring(0,token.length()-2); + tokenList.add(token); + tokenList.add("."); + tokenList.add("\""); + } else if(last.equals(".") || last.equals("!")|| last.equals("?") || last.equals(",") || last.equals(";") || last.equals(":") || last.equals("\"")) { From abc29efe3fcd908bf7de72bee602d113d46236db Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sun, 26 Jan 2025 00:32:58 -0800 Subject: [PATCH 14/15] added test for Strings that only contain punctuation marks --- src/LowercaseSentenceTokenizerTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/LowercaseSentenceTokenizerTest.java b/src/LowercaseSentenceTokenizerTest.java index 6d8da2b..3b9310e 100644 --- a/src/LowercaseSentenceTokenizerTest.java +++ b/src/LowercaseSentenceTokenizerTest.java @@ -74,5 +74,14 @@ void testTokenizeSentenceInQuotes() { assertEquals(List.of("\"", "hello","world", ".", "this", "is", "dr.smith's", "example", ".","\""), tokens); } + + @Test + void testTokenizeOnlyPunctuationMarks() { + LowercaseSentenceTokenizer tokenizer = new LowercaseSentenceTokenizer(); + Scanner scanner = new Scanner("\"!.!,?:;!?.!\""); + List tokens = tokenizer.tokenize(scanner); + + assertEquals(List.of("\"", "!",".", "!", ",", "?", ":", ";", "!","?",".","!","\""), tokens); + } } From cda49effa314c7eca12ad64d20550c799b3bf356 Mon Sep 17 00:00:00 2001 From: mlarsen <174657206+mlarsen-source@users.noreply.github.com> Date: Sun, 26 Jan 2025 01:55:20 -0800 Subject: [PATCH 15/15] added helper methods to check punctuation marks and revised tokenize method to fix bugs identified by last test --- src/LowercaseSentenceTokenizer.java | 88 ++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/src/LowercaseSentenceTokenizer.java b/src/LowercaseSentenceTokenizer.java index 4ed69fc..be22174 100644 --- a/src/LowercaseSentenceTokenizer.java +++ b/src/LowercaseSentenceTokenizer.java @@ -33,48 +33,106 @@ public List tokenize(Scanner scanner) { // TODO: Implement this function to convert the scanner's input to a list of words and periods List tokenList = new ArrayList<>(); - + while(scanner.hasNext()) { String token = scanner.next().toLowerCase(); - char firstChar = token.charAt(0); - String first = String.valueOf(firstChar); - char lastChar = token.charAt(token.length()-1); - String last = String.valueOf(lastChar); + + if (checkIfAllPunctuation(token)) + { + for (char character : token.toCharArray()) + { + String punctuation = String.valueOf(character); + tokenList.add(punctuation); + } + continue; + } - if(first.equals("\"")) + if(token.startsWith("\"")) { - tokenList.add(first); + tokenList.add("\""); token = token.substring(1,token.length()); } - + if(token.endsWith("...")) { token = token.substring(0,token.length()-3); tokenList.add(token); tokenList.add("..."); } - else if(token.endsWith(".\"")) + + else if (token.length() > 1 && token.endsWith("\"")) + { + char beforeQuote = token.charAt(token.length() - 2); + if (checkIfPunctuation(beforeQuote)) + { + token = token.substring(0, token.length() - 2); + tokenList.add(token); + String punctuation = String.valueOf(beforeQuote); + tokenList.add(punctuation); + tokenList.add("\""); + } + + else + { + token = token.substring(0, token.length() - 1); + tokenList.add(token); + tokenList.add("\""); + } + } + + else if(token.endsWith("\"")) { - token = token.substring(0,token.length()-2); + token = token.substring(0,token.length()-1); tokenList.add(token); - tokenList.add("."); tokenList.add("\""); } - else if(last.equals(".") || last.equals("!")|| last.equals("?") || last.equals(",") || last.equals(";") || last.equals(":") || last.equals("\"")) + else if(checkIfPunctuation(token.charAt(token.length() - 1))) { - tokenList.add(token.substring(0,token.length()-1)); + char lastChar = token.charAt(token.length() - 1); + token = token.substring(0, token.length() - 1); + tokenList.add(token); + String last = String.valueOf(lastChar); tokenList.add(last); - } + } + + else if (checkIfAllPunctuation(token)) + { + for (char character : token.toCharArray()) + { + String punctuation = String.valueOf(character); + tokenList.add(punctuation); + } + } else { tokenList.add(token); } } - return tokenList; } + + public boolean checkIfPunctuation(char character) + { + if(character == '.' || character == '!' || character == '?' || character == ',' || character == ';' || character == ':' || character == '\"') + { + return true; + } + + return false; + } + + public boolean checkIfAllPunctuation(String token) + { + for(char character : token.toCharArray()) + if(!checkIfPunctuation(character)) + { + return false; + } + + return true; + } }