diff --git a/README.md b/README.md
index 8cce527..453bc3b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,27 @@
-# TextMining
+### Long German Words (lange deutsche Wörter)
 
-This is the base repo for the text mining and analysis project for Software Design at Olin College.
+### What it Does
+To English speakers, very long German words often look silly. This software
+analyzes some of the works of Johann Wolfgang von Goethe. It uses the texts
+from Project Gutenberg (in German) to find the three longest words in each
+of several of his works.
+
+### How to Use It
+Install the requests library before using (pickle is part of the Python
+standard library). Run "textmining_requests.py" to get the text files from
+Project Gutenberg. Then, run "textmining_processing.py" to find the longest
+German words! They will print in the terminal.
+
+### Progress
+This project was originally written in Week 6 of Software Design, and I
+reworked it in Week 13. It is now written in an object-oriented style and
+has some updated text mining processes.
+
+### Some long words found
+- freundschaftliche - friendly
+- Allerdurchlauchtigster - most gracious/serene
+- unwahrscheinlichem - unlikely
+- zusammenschrumpfte - shriveled up
+- Freundschaftsbezeigungen - demonstrations of friendship
+- durcheinandergeschüttelt - agitated
+- Amtschreiberstochter - office clerk's daughter
diff --git a/berlichingen.txt b/berlichingen.txt
deleted file mode 100644
index 0c9abfc..0000000
Binary files a/berlichingen.txt and /dev/null differ
diff --git a/classText.py b/classText.py
new file mode 100644
index 0000000..293b046
--- /dev/null
+++ b/classText.py
@@ -0,0 +1,69 @@
+"""
+Sarah Barden
+This module defines a Text class for my text mining project. It includes
+two methods called cleanText and longestWords.
+"""
+import pickle
+
+
+class Text:
+
+    def __init__(self, title, author, fileName):
+        self.title = title
+        self.author = author
+        self.fileName = fileName
+
+    def __str__(self):
+        string = ''
+        string += '{} by {}'.format(self.title, self.author)
+        return string
+
+    def cleanText(self):
+        """
+        Loads the pickled Gutenberg text, removes punctuation, and strips the
+        Project Gutenberg header and footer. Returns the cleaned text as a string.
+        """
+        inputFile = open(self.fileName, 'rb')
+        text = pickle.load(inputFile)
+
+        # removing unwanted punctuation
+        punctuation = [',', ';', '.', '-', '--', '!', '?', ')', '(', "'", '@', '*']
+        for mark in punctuation:
+            text = text.replace(mark, ' ')
+
+        # Project Gutenberg text files have a long footer and header in every
+        # file. The following finds the end of the header and the start of the
+        # footer and removes those sections.
+        start = text.find('START OF THIS PROJECT GUTENBERG')
+        if start == -1:
+            start = text.find('END THE SMALL PRINT') + 150  # approx end of the header text
+
+        end = text.find('END OF THIS PROJECT GUTENBERG')
+        if end == -1:
+            end = text.find('Ende dieses')  # ending statement in German
+
+        text = text[start:end]  # Cut the text to remove footer/header
+        return text
+
+    def longestWords(self, number):
+        """
+        Finds the n longest words in the cleaned text, where n is an input.
+        Returns a list of tuples, where each tuple is the length of the word
+        and the word as a string.
+        """
+
+        text = self.cleanText()
+        text = text.split()
+        words = []
+        # After splitting the whole text into a list of words, this sorts them
+        # all by the length into a list, from longest to shortest.
+        for word in text:
+            length = len(word)
+            words.append((length, word))
+        words.sort(reverse=True)
+        top = words[0:number]
+        return top
+
+
+if __name__ == '__main__':
+    pass
diff --git a/geschwister.txt b/geschwister.txt
deleted file mode 100644
index 4bd2d0a..0000000
Binary files a/geschwister.txt and /dev/null differ
diff --git a/iphigenie.txt b/iphigenie.txt
deleted file mode 100644
index bc3370f..0000000
Binary files a/iphigenie.txt and /dev/null differ
diff --git a/reinekefuchs.txt b/reinekefuchs.txt
deleted file mode 100644
index 4bebfbc..0000000
Binary files a/reinekefuchs.txt and /dev/null differ
diff --git a/textmining_processing.py b/textmining_processing.py
index ba59ad6..23dfdf3 100644
--- a/textmining_processing.py
+++ b/textmining_processing.py
@@ -1,60 +1,45 @@
-
-import pickle
-import string
-
 """
-Finding the longest word
+Sarah Barden
+This script is the final processing for my text mining project. It initializes
+six different works by Goethe as Text objects (see classText.py) and finds
+the three longest words in each.
 """
-plays = ['geschwister.txt', 'berlichingen.txt', 'iphigenie.txt']
-prose = ['reinekefuchs.txt']
-book = ['werther1.txt', 'werther2.txt']
-
-
-def clean_text(file_name):
-    input_file = open(file_name, 'rb')
-    text = pickle.load(input_file)
-    if file_name in plays:
-        start = text.find('Personen')  # beginning of play
-    elif file_name in prose:
-        start = text.find('Inhalt')  # beginning of prose
-    elif file_name in book:
-        start = text.find('Ausgabe')  # beginning of prose
-    text = text[start:]
-    text = text.replace(',', '')
-    text = text.replace('.', '')
-    text = text.replace('--', ' ')
-    text = text.replace('!', '')
-    text = text.replace('?', '')
-    text = text.replace(')', '')
-    text = text.replace('(', '')
-    text = text.replace("'", '')
-    print(file_name)
-    return text
-
-
-def longest_words(text):
-    text = text.split()
-    # print(text)
-    words = []
-    for word in text:
-        length = len(word)
-        words.append((length, word))
-    words.sort(reverse=True)
-    print(words)
-    top = words[0:4]
-    return top
-
-
-def final_analysis(text):
-    clean = clean_text(text)
-    top = longest_words(clean)
-    for pair in top:
-        print(pair)
-
-
-# final_analysis('berlichingen.txt')
-final_analysis('geschwister.txt')
-# final_analysis('iphigenie.txt')
-# final_analysis('reinekefuchs.txt')
-# final_analysis('werther1.txt')
-# final_analysis('werther2.txt')
+
+from classText import Text
+
+geschwister = Text('Die Geschwister', 'Johann Wolfgang von Goethe', 'geschwister.txt')
+berlichingen = Text('Götz von Berlichingen', 'Johann Wolfgang von Goethe', 'berlichingen.txt')
+iphigenie = Text('Iphigenie auf Tauris', 'Johann Wolfgang von Goethe', 'iphigenie.txt')
+reinekefuchs = Text('Reineke Fuchs', 'Johann Wolfgang von Goethe', 'reinekefuchs.txt')
+werther1 = Text('Die Leiden des jungen Werthers 1', 'Johann Wolfgang von Goethe', 'werther1.txt')
+werther2 = Text('Die Leiden des jungen Werthers 2', 'Johann Wolfgang von Goethe', 'werther2.txt')
+
+works = [geschwister, berlichingen, iphigenie, reinekefuchs,
+         werther1, werther2]
+
+
+def analyze(text):
+    """
+    Find the longest three words in a single text. Takes a Text object as input
+    and returns a list of the words themselves.
+    """
+    wordsAndLengths = text.longestWords(3)
+    wordsOnly = [word[1] for word in wordsAndLengths]
+    return wordsOnly
+
+
+def analyzeAll(works):
+    """
+    Finds the longest three words for each work in a list of multiple works.
+    Takes in a list of Text objects and returns one list of words per work.
+ """ + result = [analyze(work) for work in works] + return result + + +# running full analysis on all seven works initialized above. Prints each word +# in the console. +final = analyzeAll(works) +for work in final: + for word in work: + print(word) diff --git a/textmining_requests.py b/textmining_requests.py index 282ff79..c875b1c 100644 --- a/textmining_requests.py +++ b/textmining_requests.py @@ -1,3 +1,7 @@ +""" +Sarah Barden +Requests project gutenberg pages. There are six requests for Goethe's works. +""" import pickle import requests diff --git a/werther1.txt b/werther1.txt deleted file mode 100644 index 74ca30a..0000000 Binary files a/werther1.txt and /dev/null differ diff --git a/werther2.txt b/werther2.txt deleted file mode 100644 index 59b619b..0000000 Binary files a/werther2.txt and /dev/null differ