diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index eefca6a..712ff5d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -1,29 +1,29 @@ -name: twitter-data-analysis - -on: - push: - branches: [main] - pull_request: - branches: [main] - -permissions: - contents: read - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Test with pytest - run: | - python -m pytest +name: twitter-data-analysis + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: | + python -m pytest diff --git a/.gitignore b/.gitignore index 54e6782..7081366 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ -__pycache__/ -data/ \ No newline at end of file +__pycache__/ +data/ +.ipynb_checkpoints diff --git a/LICENSE 
b/LICENSE index a13471e..e3b94af 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2022 10 Academy - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +MIT License + +Copyright (c) 2022 10 Academy + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index ba4e845..b64ff6b 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,29 @@ -# Twitter-Data-Analysis - -### So here are the bare minimum requirement for completing this task - -1. Use this template to create a repository called Twitter-Data-Analysis in your github account. See ["Creating a repository from a template."](https://docs.github.com/en/articles/creating-a-repository-from-a-template) for more information. -2. [Download](https://drive.google.com/drive/folders/19G8dmehf9vU0u6VTKGV-yWsQOn3IvPsd) and extract the necessary data and put it in the data directory. The data should not not be added to git tracking. -3. Create a branch called “bugfix” to fix the bugs in the fix_clean_tweets_dataframe.py and fix_extract_dataframe.py -4. In branch “bugfix” use the git mv command to rename fix_clean_tweets_dataframe.py to clean_tweets_dataframe.py and fix_extract_dataframe.py to extract_dataframe.py -5. Fix the bugs on clean_tweets_dataframe.py and extract_dataframe.py -6. Multiple times, push the code you are working on to git, and once the fix is complete, merge the fix_bug branch to main branch -7. Create a new branch called “testing” for updating the unit tests in the test/ folder to be applicable to the code you fixed. - a. Build your unit and integration tests to run on small data (< 1 MB) that you copied from what is provided - avoid pushing large data to github - b. Think about the key elements (units can be functions, classes, or modules; multiple of them working together to accomplish a task requires integration testing) of the code base you are working on. 
Write the following - - Unit tests: for individual key functions and classes - - Integration tests: for the integration of multiple units working together -8. After completing the unit and integration tests, merge the “testing” branch with the main branch -9. In all cases when you merge, make sure you first do Pull Request, review, then accept the merge. -10. Use github actions in your repository such that when you git push new code (or merge a branch) to the main branch, the unit test in tests/*.py runs automatically. All tests should pass. - - -After Completing this Challenge, you would have explore - -- Unittesting -- Modular Coding -- Software Engineering Best Practices -- Python Package Structure -- Bug Fix (Debugging) - -Have Fun and Cheers +# Twitter-Data-Analysis + +### So here are the bare minimum requirement for completing this task + +1. Use this template to create a repository called Twitter-Data-Analysis in your github account. See ["Creating a repository from a template."](https://docs.github.com/en/articles/creating-a-repository-from-a-template) for more information. +2. [Download](https://drive.google.com/drive/folders/19G8dmehf9vU0u6VTKGV-yWsQOn3IvPsd) and extract the necessary data and put it in the data directory. The data should not not be added to git tracking. +3. Create a branch called “bugfix” to fix the bugs in the fix_clean_tweets_dataframe.py and fix_extract_dataframe.py +4. In branch “bugfix” use the git mv command to rename fix_clean_tweets_dataframe.py to clean_tweets_dataframe.py and fix_extract_dataframe.py to extract_dataframe.py +5. Fix the bugs on clean_tweets_dataframe.py and extract_dataframe.py +6. Multiple times, push the code you are working on to git, and once the fix is complete, merge the fix_bug branch to main branch +7. Create a new branch called “testing” for updating the unit tests in the test/ folder to be applicable to the code you fixed. + a. 
class Clean_Tweets:
    """
    Cleaning steps for a DataFrame of tweets.

    The row/column cleaning steps (``drop_*``, ``convert_*``,
    ``remove_non_english_tweets`` and ``find_hashtags``) work on the
    DataFrame that is passed in, store the cleaned result on ``self.df``
    and return that same cleaned DataFrame, so a pipeline can be chained
    either through the return values or through the instance attribute.

    The column helpers (``fill_missing``, ``replace_empty_string`` and
    ``remove_characters``) only touch the DataFrame they are given.
    """

    # Columns that convert_to_numbers() coerces to a numeric dtype.
    NUMERIC_COLUMNS = (
        'id', 'subjectivity', 'listed_count', 'retweet_count',
        'friends_count', 'favorite_count', 'statuses_count',
        'followers_count', 'polarity',
    )

    # '#', one or more letters, then at least one more letter, digit,
    # underscore or hyphen.
    HASHTAG_PATTERN = re.compile(r'(#[A-Za-z]+[A-Za-z0-9_-]+)')

    # Anything that is not an ASCII letter, digit, whitespace, '_' or '-'.
    UNWANTED_CHARS_PATTERN = re.compile(r'[^a-zA-Z0-9\s_-]')

    # Polarity magnitudes below this value (other than exactly 0) are
    # reported as 'UNK' by text_category().
    POLARITY_THRESHOLD = 1e-11

    def __init__(self, df: pd.DataFrame):
        """Remember the DataFrame to clean and announce start-up."""
        self.df = df
        print('Automation in Action...!!!')

    def drop_unwanted_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove rows that merely repeat the column names.

        Such rows originate from the data collection stage: a row is
        dropped when its ``retweet_count`` cell holds the text
        'retweet_count' or its ``polarity`` cell holds the text 'polarity'.

        The rows are dropped in place and the cleaned frame is returned.
        """
        header_rows = df[(df['retweet_count'] == 'retweet_count')
                         | (df['polarity'] == 'polarity')].index
        df.drop(header_rows, inplace=True)
        self.df = df
        return df

    def drop_duplicate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop rows whose ``original_text`` duplicates an earlier row
        (in place) and return the frame.
        """
        df.drop_duplicates(subset='original_text', inplace=True)
        self.df = df
        return df

    def convert_to_datetime(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert ``created_at`` to datetime; unparsable values become NaT.
        """
        df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
        self.df = df
        return df

    def convert_to_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert the columns listed in ``NUMERIC_COLUMNS`` to numbers.

        Values that cannot be parsed become NaN. Columns that are not
        present in ``df`` are skipped instead of raising ``KeyError``, so
        the method also works on frames that carry only a subset of them.
        """
        for column in self.NUMERIC_COLUMNS:
            if column in df.columns:
                df[column] = pd.to_numeric(df[column], errors='coerce')
        self.df = df
        return df

    def remove_non_english_tweets(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Keep only the rows whose ``lang`` is 'en' (in place).
        """
        df.query("lang == 'en'", inplace=True)
        self.df = df
        return df

    def drop_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``df`` without the rows that contain any null.
        """
        df = df.dropna(axis=0, how='any')
        self.df = df
        return df

    def find_hashtags(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Extract the hashtags found in each tweet's text.

        The ``hashtags`` column is set to the list of matches of
        ``HASHTAG_PATTERN`` in ``original_text``; rows whose text is not a
        string (e.g. null) get an empty list.

        NOTE(review): this overwrites any existing ``hashtags`` column --
        confirm that is the intended target column.
        """
        pattern = self.HASHTAG_PATTERN
        df['hashtags'] = df['original_text'].apply(
            lambda text: pattern.findall(text) if isinstance(text, str) else [])
        self.df = df
        return df

    def text_category(self, series: pd.Series) -> list:
        """
        Map each polarity to 'positive', 'negative' or 'neutral'.

        Exactly 0 is 'neutral'; values whose magnitude is at least
        ``POLARITY_THRESHOLD`` are 'positive' / 'negative'. Anything else
        (tiny non-zero magnitudes, NaN) is labelled 'UNK'.
        """
        threshold = self.POLARITY_THRESHOLD
        polarities = []
        for pol in series:
            if pol >= threshold:
                polarities.append("positive")
            elif pol == 0:
                polarities.append("neutral")
            elif pol <= -threshold:
                polarities.append("negative")
            else:
                polarities.append('UNK')
        return polarities

    def fill_missing(self, df: pd.DataFrame, column: str, value):
        """
        Fill the null values of ``column`` with ``value`` and return ``df``.
        """
        df[column] = df[column].fillna(value)
        return df

    def replace_empty_string(self, df: pd.DataFrame, column: str, value: str):
        """
        Replace empty strings in ``column`` with ``value`` and return ``df``.
        """
        df[column] = df[column].apply(lambda x: value if x == "" else x)
        return df

    def remove_characters(self, df: pd.DataFrame, column: str):
        """
        Strip every character except ASCII letters, digits, whitespace,
        underscore and hyphen from the strings in ``column``.

        Non-string cells (e.g. nulls) are left untouched.
        """
        pattern = self.UNWANTED_CHARS_PATTERN
        df[column] = df[column].apply(
            lambda text: pattern.sub("", text) if isinstance(text, str) else text)
        return df

    def extract_device_name(self, source: str):
        """
        Return the device name from a tweet's ``source`` text.

        For anchor markup such as ``<a href="...">Twitter for Android</a>``
        the text between the tags is returned. A source without that
        markup is returned stripped, as it already is the plain name.
        """
        parts = re.split('<|>', source)
        if len(parts) < 3:
            return source.strip()
        return parts[2].strip()
def read_json(json_file: str) -> tuple:
    """
    Read a JSON-lines file (one JSON document per line) into a list.

    Args:
    -----
    json_file: str - path of a json file

    Returns
    -------
    a ``(count, tweets)`` tuple: the number of parsed documents and the
    list of parsed documents. Blank lines are skipped.
    """
    tweets_data = []
    # The context manager guarantees the file handle is closed.
    with open(json_file, 'r') as handle:
        for line in handle:
            if line.strip():
                tweets_data.append(json.loads(line))
    return len(tweets_data), tweets_data


class TweetDfExtractor:
    """
    Parse a list of tweet dictionaries into a pandas dataframe.

    Each ``find_*`` method returns one list with one entry per tweet, in
    the order of ``tweets_list``; ``get_tweet_df`` assembles those lists
    into a dataframe.
    """

    def __init__(self, tweets_list):
        """
        The initializer for the TweetDfExtractor class.

        tweets_list: list of tweet dictionaries (e.g. from ``read_json``).
        """
        self.tweets_list = tweets_list

    def find_statuses_count(self) -> list:
        """Return ``user.statuses_count`` of every tweet."""
        return [x['user']['statuses_count'] for x in self.tweets_list]

    def find_full_text(self) -> list:
        """
        Return the ``full_text`` of every tweet, or 'NA' when a tweet has
        no ``full_text`` key.
        """
        return [x.get('full_text', 'NA') for x in self.tweets_list]

    def find_sentiments(self, text) -> tuple:
        """
        Return ``(polarity, subjectivity)``: two lists holding the
        TextBlob polarity and subjectivity of every string in ``text``.
        """
        polarity = []
        subjectivity = []
        for x in text:
            # One analysis per text yields both values.
            sentiment = TextBlob(x).sentiment
            polarity.append(sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
        return (polarity, subjectivity)

    def find_created_time(self) -> list:
        """Return the ``created_at`` value of every tweet."""
        return [x['created_at'] for x in self.tweets_list]

    def find_source(self) -> list:
        """Return the ``source`` value of every tweet."""
        return [x['source'] for x in self.tweets_list]

    def find_screen_name(self) -> list:
        """Return ``user.screen_name`` of every tweet."""
        return [x['user']['screen_name'] for x in self.tweets_list]

    def find_followers_count(self) -> list:
        """Return ``user.followers_count`` of every tweet."""
        return [x['user']['followers_count'] for x in self.tweets_list]

    def find_friends_count(self) -> list:
        """Return ``user.friends_count`` of every tweet."""
        return [x['user']['friends_count'] for x in self.tweets_list]

    def is_sensitive(self) -> list:
        """
        Return ``possibly_sensitive`` of every tweet, or None for tweets
        that do not carry that key.
        """
        return [x.get('possibly_sensitive') for x in self.tweets_list]

    def find_favorite_count(self) -> list:
        """
        Return the ``favorite_count`` of the retweeted status of every
        tweet, or 0 for tweets without a ``retweeted_status``.
        """
        return [x['retweeted_status']['favorite_count']
                if 'retweeted_status' in x else 0
                for x in self.tweets_list]

    def find_retweet_count(self) -> list:
        """
        Return the ``retweet_count`` of the retweeted status of every
        tweet, or 0 for tweets without a ``retweeted_status``.
        """
        return [x['retweeted_status']['retweet_count']
                if 'retweeted_status' in x else 0
                for x in self.tweets_list]

    def find_hashtags(self) -> list:
        """Return ``entities.hashtags`` of every tweet."""
        return [x['entities']['hashtags'] for x in self.tweets_list]

    def find_mentions(self) -> list:
        """Return ``entities.user_mentions`` of every tweet."""
        return [x['entities']['user_mentions'] for x in self.tweets_list]

    def find_location(self) -> list:
        """
        Return ``user.location`` of every tweet, or None when the user or
        the location is missing.
        """
        return [x.get('user', {}).get('location', None)
                for x in self.tweets_list]

    def find_lang(self) -> list:
        """Return the ``lang`` value of every tweet."""
        return [x['lang'] for x in self.tweets_list]

    def find_authors(self) -> list:
        """
        Return the author of every tweet.

        The author is the screen name of the tweeting user, which is the
        value ``get_tweet_df`` stores in the ``original_author`` column.
        """
        return self.find_screen_name()

    def get_tweet_df(self, save: bool = False,
                     save_as: str = 'processed_tweet_data',
                     as_csv: bool = False) -> pd.DataFrame:
        """
        Build the dataframe of extracted tweet features.

        Args:
        -----
        save: bool - when True the dataframe is also written under 'data/'
        save_as: str - file name (without extension) used when saving
        as_csv: bool - save as '.csv' instead of '.json'

        Returns
        -------
        dataframe with one row per tweet and the columns created_at,
        source, original_text, polarity, subjectivity, lang,
        favorite_count, status_count, retweet_count, screen_name,
        original_author, followers_count, friends_count,
        possibly_sensitive, hashtags, user_mentions and place.
        """
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)

        # Insertion order of this dict fixes the column order.
        sel_data = {
            'created_at': self.find_created_time(),
            'source': self.find_source(),
            'original_text': text,
            'polarity': polarity,
            'subjectivity': subjectivity,
            'lang': self.find_lang(),
            'favorite_count': self.find_favorite_count(),
            'status_count': self.find_statuses_count(),
            'retweet_count': self.find_retweet_count(),
            'screen_name': self.find_screen_name(),
            'original_author': self.find_authors(),
            'followers_count': self.find_followers_count(),
            'friends_count': self.find_friends_count(),
            'possibly_sensitive': self.is_sensitive(),
            'hashtags': self.find_hashtags(),
            'user_mentions': self.find_mentions(),
            'place': self.find_location(),
        }
        final_dataframe = pd.DataFrame(data=sel_data)

        if save:
            import os

            # Make sure the target directory exists before writing.
            os.makedirs('data', exist_ok=True)
            if as_csv:
                data_path = 'data/' + save_as + '.csv'
                final_dataframe.to_csv(data_path, index=False)
            else:
                data_path = 'data/' + save_as + '.json'
                final_dataframe.to_json(data_path, indent=4)
            print(f'File {save_as} successfully saved as {data_path}')
        return final_dataframe
['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', + 'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries'] + + # for the global data set + _, global_tweet_list = read_json(global_data) + # to make sure all the data is passe to he + print(f"Total number of data: {_}") + global_tweet = TweetDfExtractor(global_tweet_list) + global_tweet_df = global_tweet.get_tweet_df(save= True, save_as='processed_global_tweet_data') + print(global_tweet_df) + + """# for the african data set + _, african_tweet_list = read_json(african_data) + # to make sure all the data is passe to he + print(f"Total number of data: {_}") + african_tweet = TweetDfExtractor(african_tweet_list) + african_tweet_df = african_tweet.get_tweet_df(save = True, save_as='processed_african_tweet_data') + print(african_tweet_df)""" + + # TODO : use all defined functions to generate a dataframe with the specified columns above diff --git a/fix_clean_tweets_dataframe.py b/fix_clean_tweets_dataframe.py deleted file mode 100644 index 7b45a35..0000000 --- a/fix_clean_tweets_dataframe.py +++ /dev/null @@ -1,58 +0,0 @@ -class Clean_Tweets: - """ - The PEP8 Standard AMAZING!!! - """ - def __init__(self, df:pd.DataFrame): - self.df = df - print('Automation in Action...!!!') - - def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame: - """ - remove rows that has column names. This error originated from - the data collection stage. 
- """ - unwanted_rows = df[df['retweet_count'] == 'retweet_count' ].index - df.drop(unwanted_rows , inplace=True) - df = df[df['polarity'] != 'polarity'] - - return df - def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame: - """ - drop duplicate rows - """ - - --- - - return df - def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame: - """ - convert column to datetime - """ - ---- - - ---- - - df = df[df['created_at'] >= '2020-12-31' ] - - return df - - def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame: - """ - convert columns like polarity, subjectivity, retweet_count - favorite_count etc to numbers - """ - df['polarity'] = pd.---- - - ---- - ---- - - return df - - def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame: - """ - remove non english tweets from lang - """ - - df = ---- - - return df \ No newline at end of file diff --git a/fix_extract_dataframe.py b/fix_extract_dataframe.py deleted file mode 100644 index 3bd792d..0000000 --- a/fix_extract_dataframe.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -import pandas as pd -from textblob import TextBlob - - -def read_json(json_file: str)->list: - """ - json file reader to open and read json files into a list - Args: - ----- - json_file: str - path of a json file - - Returns - ------- - length of the json file and a list of json - """ - - tweets_data = [] - for tweets in open(json_file,'r'): - tweets_data.append(json.loads(tweets)) - - - return len(tweets_data), tweets_data - -class TweetDfExtractor: - """ - this function will parse tweets json into a pandas dataframe - - Return - ------ - dataframe - """ - def __init__(self, tweets_list): - - self.tweets_list = tweets_list - - # an example function - def find_statuses_count(self)->list: - statuses_count - - def find_full_text(self)->list: - text = - - - def find_sentiments(self, text)->list: - - return polarity, self.subjectivity - - def find_created_time(self)->list: - - return created_at - - def find_source(self)->list: - 
source = - - return source - - def find_screen_name(self)->list: - screen_name = - - def find_followers_count(self)->list: - followers_count = - - def find_friends_count(self)->list: - friends_count = - - def is_sensitive(self)->list: - try: - is_sensitive = [x['possibly_sensitive'] for x in self.tweets_list] - except KeyError: - is_sensitive = None - - return is_sensitive - - def find_favourite_count(self)->list: - - - def find_retweet_count(self)->list: - retweet_count = - - def find_hashtags(self)->list: - hashtags = - - def find_mentions(self)->list: - mentions = - - - def find_location(self)->list: - try: - location = self.tweets_list['user']['location'] - except TypeError: - location = '' - - return location - - - - - def get_tweet_df(self, save=False)->pd.DataFrame: - """required column to be generated you should be creative and add more features""" - - columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', - 'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place'] - - created_at = self.find_created_time() - source = self.find_source() - text = self.find_full_text() - polarity, subjectivity = self.find_sentiments(text) - lang = self.find_lang() - fav_count = self.find_favourite_count() - retweet_count = self.find_retweet_count() - screen_name = self.find_screen_name() - follower_count = self.find_followers_count() - friends_count = self.find_friends_count() - sensitivity = self.is_sensitive() - hashtags = self.find_hashtags() - mentions = self.find_mentions() - location = self.find_location() - data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location) - df = pd.DataFrame(data=data, columns=columns) - - if save: - df.to_csv('processed_tweet_data.csv', index=False) - print('File Successfully Saved.!!!') - - return df - - 
-if __name__ == "__main__": - # required column to be generated you should be creative and add more features - columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', - 'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries'] - _, tweet_list = read_json("../covid19.json") - tweet = TweetDfExtractor(tweet_list) - tweet_df = tweet.get_tweet_df() - - # use all defined functions to generate a dataframe with the specified columns above \ No newline at end of file diff --git a/notebooks/EDA.ipynb b/notebooks/EDA.ipynb new file mode 100644 index 0000000..4435bdc --- /dev/null +++ b/notebooks/EDA.ipynb @@ -0,0 +1,1623 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "# imports\n", + "import pandas as pd\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import custom libraries and scripts\n", + "# sys.path.append(os.path.abspath(os.path.join(\"../..\")))\n", + "sys.path.append(\".\")\n", + "sys.path.append(\"..\")\n", + "\n", + "from defaults import *\n", + "from extract_dataframe import read_json\n", + "from extract_dataframe import TweetDfExtractor\n", + "from clean_tweets_dataframe import Clean_Tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# read processed data set\n", + "tweets_df = pd.read_csv('../data/clean_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
created_atsourceoriginal_textpolaritysubjectivitylangfavorite_countstatus_countretweet_countscreen_nameoriginal_authorfollowers_countfriends_countpossibly_sensitivehashtagsuser_mentionsplace
02022-08-07 22:31:20+00:00Twitter for AndroidRT @i_ameztoy: Extra random image (I):\\n\\nLets...-1.250000e-010.190625en480972i_ameztoyi_ameztoy204972621unknown[{'text': 'City', 'indices': [132, 137]}][{'screen_name': 'i_ameztoy', 'name': 'Iban Am...unknown
12022-08-07 22:31:16+00:00Twitter for AndroidRT @IndoPac_Info: #China's media explains the ...-1.000000e-010.100000en6915831201ZIisqZIisq65272unknown[{'text': 'China', 'indices': [18, 24]}, {'tex...[{'screen_name': 'IndoPac_Info', 'name': 'Indo...unknown
22022-08-07 22:31:07+00:00Twitter for AndroidChina even cut off communication, they don't a...0.000000e+000.000000en016270Fin21FreeFin21Free85392unknown[{'text': 'XiJinping', 'indices': [127, 137]}][{'screen_name': 'ZelenskyyUa', 'name': 'Волод...Netherlands
32022-08-07 22:31:06+00:00Twitter for AndroidPutin to #XiJinping : I told you my friend, Ta...1.000000e-010.350000en016270Fin21FreeFin21Free85392unknown[{'text': 'XiJinping', 'indices': [9, 19]}][]Netherlands
42022-08-07 22:31:04+00:00Twitter for iPhoneRT @ChinaUncensored: I’m sorry, I thought Taiw...-6.938894e-180.556250en152118958381VizziniDoloresVizziniDolores9102608unknown[][{'screen_name': 'ChinaUncensored', 'name': 'C...Ayent, Schweiz
52022-08-07 22:31:02+00:00Twitter for AndroidRT @benedictrogers: We must not let this happe...2.000000e-010.500000en1164848336GraceCh15554845GraceCh15554845207540.0[{'text': 'Taiwan', 'indices': [84, 91]}][{'screen_name': 'benedictrogers', 'name': 'Be...Melbourne, Victoria
62022-08-07 22:30:59+00:00Twitter for AndroidRT @TGTM_Official: What kind of country can co...1.583333e-010.800000en11064173411Philipkuma1Philipkuma112264unknown[{'text': 'Taiwan', 'indices': [101, 108]}, {'...[{'screen_name': 'TGTM_Official', 'name': 'The...unknown
72022-08-07 22:30:59+00:00Twitter for AndroidRT @ChinaInfo777: #PinkFloyd singer Roger Wate...0.000000e+000.000000en10241025nhohn2011nhohn2011870508unknown[{'text': 'PinkFloyd', 'indices': [18, 28]}, {...[{'screen_name': 'ChinaInfo777', 'name': 'Chin...Florida, USA
82022-08-07 22:30:50+00:00Twitter for AndroidRT @AmbQinGang: China's SC&amp;FM Wang Yi elab...0.000000e+000.000000en1221630239ClaudioColomaRIClaudioColomaRI127263unknown[{'text': 'Taiwan', 'indices': [80, 87]}][{'screen_name': 'AmbQinGang', 'name': 'Qin Ga...El mundo periférico
92022-08-07 22:30:45+00:00Twitter Web AppRT @CGMeifangZhang: Chinese ambassador to the ...2.000000e-010.375000en4910718825jmarzola1jmarzola1213877unknown[{'text': 'USA', 'indices': [66, 70]}, {'text'...[{'screen_name': 'CGMeifangZhang', 'name': 'Zh...unknown
\n", + "
" + ], + "text/plain": [ + " created_at source \\\n", + "0 2022-08-07 22:31:20+00:00 Twitter for Android \n", + "1 2022-08-07 22:31:16+00:00 Twitter for Android \n", + "2 2022-08-07 22:31:07+00:00 Twitter for Android \n", + "3 2022-08-07 22:31:06+00:00 Twitter for Android \n", + "4 2022-08-07 22:31:04+00:00 Twitter for iPhone \n", + "5 2022-08-07 22:31:02+00:00 Twitter for Android \n", + "6 2022-08-07 22:30:59+00:00 Twitter for Android \n", + "7 2022-08-07 22:30:59+00:00 Twitter for Android \n", + "8 2022-08-07 22:30:50+00:00 Twitter for Android \n", + "9 2022-08-07 22:30:45+00:00 Twitter Web App \n", + "\n", + " original_text polarity \\\n", + "0 RT @i_ameztoy: Extra random image (I):\\n\\nLets... -1.250000e-01 \n", + "1 RT @IndoPac_Info: #China's media explains the ... -1.000000e-01 \n", + "2 China even cut off communication, they don't a... 0.000000e+00 \n", + "3 Putin to #XiJinping : I told you my friend, Ta... 1.000000e-01 \n", + "4 RT @ChinaUncensored: I’m sorry, I thought Taiw... -6.938894e-18 \n", + "5 RT @benedictrogers: We must not let this happe... 2.000000e-01 \n", + "6 RT @TGTM_Official: What kind of country can co... 1.583333e-01 \n", + "7 RT @ChinaInfo777: #PinkFloyd singer Roger Wate... 0.000000e+00 \n", + "8 RT @AmbQinGang: China's SC&FM Wang Yi elab... 0.000000e+00 \n", + "9 RT @CGMeifangZhang: Chinese ambassador to the ... 
2.000000e-01 \n", + "\n", + " subjectivity lang favorite_count status_count retweet_count \\\n", + "0 0.190625 en 4 8097 2 \n", + "1 0.100000 en 691 5831 201 \n", + "2 0.000000 en 0 1627 0 \n", + "3 0.350000 en 0 1627 0 \n", + "4 0.556250 en 1521 18958 381 \n", + "5 0.500000 en 116 48483 36 \n", + "6 0.800000 en 1106 4173 411 \n", + "7 0.000000 en 10 24102 5 \n", + "8 0.000000 en 1221 630 239 \n", + "9 0.375000 en 49 107188 25 \n", + "\n", + " screen_name original_author followers_count friends_count \\\n", + "0 i_ameztoy i_ameztoy 20497 2621 \n", + "1 ZIisq ZIisq 65 272 \n", + "2 Fin21Free Fin21Free 85 392 \n", + "3 Fin21Free Fin21Free 85 392 \n", + "4 VizziniDolores VizziniDolores 910 2608 \n", + "5 GraceCh15554845 GraceCh15554845 207 54 \n", + "6 Philipkuma1 Philipkuma1 12 264 \n", + "7 nhohn2011 nhohn2011 870 508 \n", + "8 ClaudioColomaRI ClaudioColomaRI 127 263 \n", + "9 jmarzola1 jmarzola1 213 877 \n", + "\n", + " possibly_sensitive hashtags \\\n", + "0 unknown [{'text': 'City', 'indices': [132, 137]}] \n", + "1 unknown [{'text': 'China', 'indices': [18, 24]}, {'tex... \n", + "2 unknown [{'text': 'XiJinping', 'indices': [127, 137]}] \n", + "3 unknown [{'text': 'XiJinping', 'indices': [9, 19]}] \n", + "4 unknown [] \n", + "5 0.0 [{'text': 'Taiwan', 'indices': [84, 91]}] \n", + "6 unknown [{'text': 'Taiwan', 'indices': [101, 108]}, {'... \n", + "7 unknown [{'text': 'PinkFloyd', 'indices': [18, 28]}, {... \n", + "8 unknown [{'text': 'Taiwan', 'indices': [80, 87]}] \n", + "9 unknown [{'text': 'USA', 'indices': [66, 70]}, {'text'... \n", + "\n", + " user_mentions place \n", + "0 [{'screen_name': 'i_ameztoy', 'name': 'Iban Am... unknown \n", + "1 [{'screen_name': 'IndoPac_Info', 'name': 'Indo... unknown \n", + "2 [{'screen_name': 'ZelenskyyUa', 'name': 'Волод... Netherlands \n", + "3 [] Netherlands \n", + "4 [{'screen_name': 'ChinaUncensored', 'name': 'C... Ayent, Schweiz \n", + "5 [{'screen_name': 'benedictrogers', 'name': 'Be... 
Melbourne, Victoria \n", + "6 [{'screen_name': 'TGTM_Official', 'name': 'The... unknown \n", + "7 [{'screen_name': 'ChinaInfo777', 'name': 'Chin... Florida, USA \n", + "8 [{'screen_name': 'AmbQinGang', 'name': 'Qin Ga... El mundo periférico \n", + "9 [{'screen_name': 'CGMeifangZhang', 'name': 'Zh... unknown " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automation in Action...!!!\n" + ] + } + ], + "source": [ + "cleaner = Clean_Tweets(tweets_df.copy())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Making explorations" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(7440, 17)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# shape of dataframe\n", + "tweets_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 7440 entries, 0 to 7439\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 created_at 7440 non-null object \n", + " 1 source 7440 non-null object \n", + " 2 original_text 7440 non-null object \n", + " 3 polarity 7440 non-null float64\n", + " 4 subjectivity 7440 non-null float64\n", + " 5 lang 7440 non-null object \n", + " 6 favorite_count 7440 non-null int64 \n", + " 7 status_count 7440 non-null int64 \n", + " 8 retweet_count 7440 non-null int64 \n", + " 9 screen_name 7440 non-null object \n", + " 10 original_author 7440 non-null object \n", + " 11 followers_count 7440 non-null int64 \n", + " 12 friends_count 7440 non-null int64 \n", 
+ " 13 possibly_sensitive 7440 non-null object \n", + " 14 hashtags 7440 non-null object \n", + " 15 user_mentions 7440 non-null object \n", + " 16 place 7440 non-null object \n", + "dtypes: float64(2), int64(5), object(10)\n", + "memory usage: 988.2+ KB\n" + ] + } + ], + "source": [ + "tweets_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "created_at 0\n", + "source 0\n", + "original_text 0\n", + "polarity 0\n", + "subjectivity 0\n", + "lang 0\n", + "favorite_count 0\n", + "status_count 0\n", + "retweet_count 0\n", + "screen_name 0\n", + "original_author 0\n", + "followers_count 0\n", + "friends_count 0\n", + "possibly_sensitive 0\n", + "hashtags 0\n", + "user_mentions 0\n", + "place 0\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
polaritysubjectivityfavorite_countstatus_countretweet_countfollowers_countfriends_count
count7440.0000007440.0000007440.0000007.440000e+037440.0000007.440000e+037440.000000
mean0.0567830.295638203.3512104.900565e+0438.7131724.107761e+041715.558871
std0.2301560.2878051655.6901481.432954e+05326.7570254.910108e+055305.897528
min-1.0000000.0000000.0000001.000000e+000.0000000.000000e+000.000000
25%0.0000000.0000000.0000001.549250e+030.0000007.275000e+01106.000000
50%0.0000000.2500000.0000007.904000e+030.0000003.670000e+02440.000000
75%0.1363640.5000004.0000003.510900e+042.0000001.833000e+031505.000000
max1.0000001.00000065170.0000004.108317e+0617409.0000001.449852e+07208360.000000
\n", + "
" + ], + "text/plain": [ + " polarity subjectivity favorite_count status_count retweet_count \\\n", + "count 7440.000000 7440.000000 7440.000000 7.440000e+03 7440.000000 \n", + "mean 0.056783 0.295638 203.351210 4.900565e+04 38.713172 \n", + "std 0.230156 0.287805 1655.690148 1.432954e+05 326.757025 \n", + "min -1.000000 0.000000 0.000000 1.000000e+00 0.000000 \n", + "25% 0.000000 0.000000 0.000000 1.549250e+03 0.000000 \n", + "50% 0.000000 0.250000 0.000000 7.904000e+03 0.000000 \n", + "75% 0.136364 0.500000 4.000000 3.510900e+04 2.000000 \n", + "max 1.000000 1.000000 65170.000000 4.108317e+06 17409.000000 \n", + "\n", + " followers_count friends_count \n", + "count 7.440000e+03 7440.000000 \n", + "mean 4.107761e+04 1715.558871 \n", + "std 4.910108e+05 5305.897528 \n", + "min 0.000000e+00 0.000000 \n", + "25% 7.275000e+01 106.000000 \n", + "50% 3.670000e+02 440.000000 \n", + "75% 1.833000e+03 1505.000000 \n", + "max 1.449852e+07 208360.000000 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# stats about numerical columns\n", + "tweets_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Univariate" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 i_ameztoy\n", + "1 ZIisq\n", + "2 Fin21Free\n", + "3 Fin21Free\n", + "4 VizziniDolores\n", + " ... 
\n", + "7435 PelosiLibArmy\n", + "7436 SonnyMullins13\n", + "7437 TECO_Toronto\n", + "7438 samserjio93\n", + "7439 ZeitounRimal\n", + "Name: original_author, Length: 7440, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.original_author" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TrumpThuan 116\n", + "AarianNewsX 57\n", + "CGMeifangZhang 43\n", + "SoizaDavid 42\n", + "doos94619918 36\n", + " ... \n", + "AoxiPRNew 1\n", + "Eloy_Sauvan 1\n", + "carnivorecabbie 1\n", + "FarmSailing 1\n", + "ZeitounRimal 1\n", + "Name: original_author, Length: 4624, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.original_author.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.tick_params(axis='x', labelsize=10)\n", + "ax.tick_params(axis='y', labelsize=10)\n", + "ax.set_xlabel('Twitter', fontsize=10)\n", + "ax.set_ylabel('Number of tweets' , fontsize=10)\n", + "ax.set_title('Top 10 Tweeters', fontsize=10)\n", + "tweets_df.original_author.value_counts()[:10].plot(ax=ax, kind='bar', color='orange')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### locations" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "unknown 2805\n", + "Việt Nam 116\n", + "India 107\n", + "United States 72\n", + "Turn on 🔔 57\n", + " ... \n", + "New York, New York 1\n", + "Fontaines-Saint-Martin, France 1\n", + "🇺🇲🇺🇲🇺🇲 1\n", + "Lisbon 1\n", + "🇺🇲🇷🇺🇺🇦🇫🇷🇦🇪🇮🇱🏳️‍🌈 1\n", + "Name: place, Length: 1809, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.place.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most locations are unknown" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# top 4 locations of users\n", + "fig, ax = plt.subplots()\n", + "ax.tick_params(axis='x', labelsize=10)\n", + "ax.tick_params(axis='y', labelsize=10)\n", + "ax.set_xlabel('Twitters', fontsize=10)\n", + "ax.set_ylabel('Number of locations' , fontsize=10)\n", + "ax.set_title('Top 4 Locations', fontsize=10)\n", + "tweets_df.place.value_counts()[:4].plot(ax=ax, kind='bar', color='orange')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### source" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Twitter Web App 2717\n", + "Twitter for Android 2360\n", + "Twitter for iPhone 1531\n", + "Twitter for iPad 191\n", + "TweetDeck 127\n", + "Name: source, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df[\"source\"].value_counts()[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The top five sources of tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# top 5 sources of users\n", + "fig, ax = plt.subplots()\n", + "ax.tick_params(axis='x', labelsize=10)\n", + "ax.tick_params(axis='y', labelsize=10)\n", + "ax.set_xlabel('Twitters', fontsize=10)\n", + "ax.set_ylabel('Number of sources' , fontsize=10)\n", + "ax.set_title('Top 5 sources', fontsize=10)\n", + "tweets_df.source.value_counts()[:5].plot(ax=ax, kind='bar')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Possibly sensitive" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0 3866\n", + "unknown 3463\n", + "1.0 111\n", + "Name: possibly_sensitive, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df[\"possibly_sensitive\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tweets_df[\"possibly_sensitive\"].value_counts().plot(kind=\"pie\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Polarity and subjectivity" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " 0.000000 2894\n", + "-0.100000 269\n", + " 0.500000 225\n", + "-0.050000 188\n", + " 0.200000 178\n", + " ... \n", + " 0.151667 1\n", + "-0.190000 1\n", + "-0.140136 1\n", + " 0.013624 1\n", + " 0.207143 1\n", + "Name: polarity, Length: 760, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df[\"polarity\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Sentiments" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "pols = cleaner.text_category(series= tweets_df.polarity)\n", + "pols = pd.Series(pols)\n", + "\n", + "# top 5 sources of users\n", + "fig, ax = plt.subplots()\n", + "ax.tick_params(axis='x', labelsize=10)\n", + "ax.tick_params(axis='y', labelsize=10)\n", + "ax.set_xlabel('Values', fontsize=10)\n", + "ax.set_ylabel('Sentiments' , fontsize=10)\n", + "ax.set_title('Sentiment analysis based on polarity', fontsize=10)\n", + "pols.value_counts().plot(ax=ax, kind='bar')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.000000 2314\n", + "0.500000 435\n", + "0.100000 292\n", + "1.000000 255\n", + "0.400000 231\n", + " ... \n", + "0.301667 1\n", + "0.500168 1\n", + "0.417857 1\n", + "0.343750 1\n", + "0.421429 1\n", + "Name: subjectivity, Length: 710, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.subjectivity.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "subs = cleaner.text_category(series= tweets_df.subjectivity)\n", + "subs = pd.Series(subs)\n", + "\n", + "# top 5 sources of users\n", + "fig, ax = plt.subplots()\n", + "ax.tick_params(axis='x', labelsize=10)\n", + "ax.tick_params(axis='y', labelsize=10)\n", + "ax.set_xlabel('Values', fontsize=10)\n", + "ax.set_ylabel('Subjectivity' , fontsize=10)\n", + "ax.set_title('Subjectivity values', fontsize=10)\n", + "subs.value_counts().plot(ax=ax, kind='bar')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hashtags" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'text': 'City', 'indices': [132, 137]}]\n", + "1 [{'text': 'China', 'indices': [18, 24]}, {'tex...\n", + "2 [{'text': 'XiJinping', 'indices': [127, 137]}]\n", + "3 [{'text': 'XiJinping', 'indices': [9, 19]}]\n", + "4 []\n", + " ... 
\n", + "7435 [{'text': 'China', 'indices': [29, 35]}, {'tex...\n", + "7436 [{'text': 'exactly', 'indices': [29, 37]}, {'t...\n", + "7437 [{'text': 'Taiwan', 'indices': [168, 175]}, {'...\n", + "7438 [{'text': 'China', 'indices': [17, 23]}, {'tex...\n", + "7439 [{'text': 'Pelosi', 'indices': [16, 23]}]\n", + "Name: hashtags, Length: 7440, dtype: object" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.hashtags" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[] 527\n", + "[{'text': 'Taiwan', 'indices': [0, 7]}] 62\n", + "[{'text': 'ThankYou', 'indices': [0, 9]}, {'text': 'JoeBiden', 'indices': [20, 29]}, {'text': 'Nides', 'indices': [42, 48]}, {'text': 'Pelosi', 'indices': [63, 70]}, {'text': 'IsraelHasTheRightToDefendItself', 'indices': [72, 104]}, {'text': 'IAmAGoodJew', 'indices': [107, 119]}] 20\n", + "[{'text': 'Taiwan', 'indices': [36, 43]}] 20\n", + "[{'text': 'Taiwan', 'indices': [44, 51]}] 18\n", + " ... 
\n", + "[{'text': 'China', 'indices': [25, 31]}, {'text': 'Taiwan', 'indices': [32, 39]}, {'text': 'US', 'indices': [40, 43]}, {'text': 'TechStocks', 'indices': [111, 122]}] 1\n", + "[{'text': 'Taiwan', 'indices': [42, 49]}, {'text': 'Chinese', 'indices': [92, 100]}, {'text': 'France', 'indices': [115, 122]}] 1\n", + "[{'text': 'Baerbock', 'indices': [0, 9]}, {'text': 'BaerbockRuecktritt', 'indices': [10, 29]}, {'text': 'pelositaiwan', 'indices': [30, 43]}, {'text': 'pelosivisittotaiwan', 'indices': [44, 64]}, {'text': 'CNN', 'indices': [186, 190]}] 1\n", + "[{'text': 'BREAKING', 'indices': [14, 23]}, {'text': 'Taiwan', 'indices': [25, 32]}, {'text': 'Chinese', 'indices': [80, 88]}] 1\n", + "[{'text': 'China', 'indices': [17, 23]}, {'text': 'Taiwan', 'indices': [45, 52]}] 1\n", + "Name: hashtags, Length: 5697, dtype: int64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df.hashtags.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see there are null hashtags" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### removing null hashtags" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'text': 'City', 'indices': [132, 137]}]\n", + "1 [{'text': 'China', 'indices': [18, 24]}, {'tex...\n", + "2 [{'text': 'XiJinping', 'indices': [127, 137]}]\n", + "3 [{'text': 'XiJinping', 'indices': [9, 19]}]\n", + "4 []\n", + " ... 
\n", + "7435 [{'text': 'China', 'indices': [29, 35]}, {'tex...\n", + "7436 [{'text': 'exactly', 'indices': [29, 37]}, {'t...\n", + "7437 [{'text': 'Taiwan', 'indices': [168, 175]}, {'...\n", + "7438 [{'text': 'China', 'indices': [17, 23]}, {'tex...\n", + "7439 [{'text': 'Pelosi', 'indices': [16, 23]}]\n", + "Name: hashtags, Length: 7440, dtype: object" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# take the rows from that have values in the hashtag columns\n", + "hashtags_list_df = tweets_df.loc[tweets_df[\"hashtags\"] != \" \"]\n", + "hashtags_list_df = hashtags_list_df['hashtags']\n", + "hashtags_list_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### flatten the hashtags" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hashtag
0[{'text':
1'City',
2'indices':
3[132,
4137]}]
\n", + "
" + ], + "text/plain": [ + " hashtag\n", + "0 [{'text':\n", + "1 'City',\n", + "2 'indices':\n", + "3 [132,\n", + "4 137]}]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#create dataframe where each hashtag gets its own row\n", + "flattened_hashtags = []\n", + "for hashtags_list in hashtags_list_df:\n", + " hashtags_list = hashtags_list.split(\" \")\n", + " for hashtag in hashtags_list:\n", + " flattened_hashtags.append(hashtag)\n", + "flattened_hashtags_df = pd.DataFrame(flattened_hashtags, columns=['hashtag'])\n", + "flattened_hashtags_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "hashtag \n", + "'indices': 27668\n", + "{'text': 20755\n", + "[{'text': 6913\n", + "'Taiwan', 5063\n", + "'China', 2439\n", + " ... \n", + "'himalayas', 1\n", + "'OPERATIVES', 1\n", + "'historical', 1\n", + "'ONEPIECE1056', 1\n", + "'antiwhitism', 1\n", + "Length: 5709, dtype: int64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flattened_hashtags_df.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "flattened_hashtags_df.value_counts().head(10).plot(kind=\"pie\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### User mentions" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'screen_name': 'i_ameztoy', 'name': 'Iban Am...\n", + "1 [{'screen_name': 'IndoPac_Info', 'name': 'Indo...\n", + "2 [{'screen_name': 'ZelenskyyUa', 'name': 'Волод...\n", + "3 []\n", + "4 [{'screen_name': 'ChinaUncensored', 'name': 'C...\n", + " ... \n", + "7435 [{'screen_name': 'metesohtaoglu', 'name': 'Met...\n", + "7436 [{'screen_name': 'NEVERBOW', 'name': 'P K', 'i...\n", + "7437 [{'screen_name': 'BBCNews', 'name': 'BBC News ...\n", + "7438 []\n", + "7439 [{'screen_name': 'Reuters', 'name': 'Reuters',...\n", + "Name: user_mentions, Length: 7440, dtype: object" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_df[\"user_mentions\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [{'screen_name': 'i_ameztoy', 'name': 'Iban Am...\n", + "1 [{'screen_name': 'IndoPac_Info', 'name': 'Indo...\n", + "2 [{'screen_name': 'ZelenskyyUa', 'name': 'Волод...\n", + "3 []\n", + "4 [{'screen_name': 'ChinaUncensored', 'name': 'C...\n", + " ... 
\n", + "7435 [{'screen_name': 'metesohtaoglu', 'name': 'Met...\n", + "7436 [{'screen_name': 'NEVERBOW', 'name': 'P K', 'i...\n", + "7437 [{'screen_name': 'BBCNews', 'name': 'BBC News ...\n", + "7438 []\n", + "7439 [{'screen_name': 'Reuters', 'name': 'Reuters',...\n", + "Name: user_mentions, Length: 7440, dtype: object" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# take the rows from that have values in the user_mentions columns\n", + "user_mentions_list_df = tweets_df.loc[tweets_df[\"user_mentions\"] != \" \"]\n", + "user_mentions_list_df = user_mentions_list_df['user_mentions']\n", + "user_mentions_list_df" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_mentions
0[{'screen_name':
1'i_ameztoy',
2'name':
3'Iban
4Ameztoy',
\n", + "
" + ], + "text/plain": [ + " user_mentions\n", + "0 [{'screen_name':\n", + "1 'i_ameztoy',\n", + "2 'name':\n", + "3 'Iban\n", + "4 Ameztoy'," + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#create dataframe where each user_mention gets its own row\n", + "flattened_user_mentions = []\n", + "for user_mentions_list in user_mentions_list_df:\n", + " user_mentions_list = user_mentions_list.split(\" \")\n", + " for user_mentions in user_mentions_list:\n", + " flattened_user_mentions.append(user_mentions)\n", + "flattened_user_mentions_df = pd.DataFrame(flattened_user_mentions, columns=['user_mentions'])\n", + "flattened_user_mentions_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_mentions \n", + "'id': 6521\n", + "'name': 6521\n", + "'id_str': 6521\n", + "'indices': 6521\n", + "[{'screen_name': 4150\n", + " ... \n", + "'Scientists 1\n", + "'ScottLucas_EA', 1\n", + "'ScottishSun', 1\n", + "'ScottsPassage', 1\n", + "🪙', 1\n", + "Length: 15428, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flattened_user_mentions_df.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "flattened_user_mentions_df.value_counts().head(5).plot(kind=\"pie\");" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.5 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a265634967a27dd555e8346f2355ee703e655fd7f0a0d20c168527cd0a3d5707" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/pre_process.ipynb b/notebooks/pre_process.ipynb new file mode 100644 index 0000000..1d54880 --- /dev/null +++ b/notebooks/pre_process.ipynb @@ -0,0 +1,1198 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "import pandas as pd\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import custom libraries and scripts\n", + "# sys.path.append(os.path.abspath(os.path.join(\"../..\")))\n", + "sys.path.append(\".\")\n", + "sys.path.append(\"..\")\n", + "\n", + "from defaults import *\n", + "from extract_dataframe import read_json\n", + "from extract_dataframe import TweetDfExtractor\n", + "from clean_tweets_dataframe import Clean_Tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
created_atsourceoriginal_textpolaritysubjectivitylangfavorite_countstatus_countretweet_countscreen_nameoriginal_authorfollowers_countfriends_countpossibly_sensitivehashtagsuser_mentionsplace
02022-08-07 22:31:20+00:00<a href=\"http://twitter.com/download/android\" ...RT @i_ameztoy: Extra random image (I):\\n\\nLets...-1.250000e-010.190625en480972i_ameztoyi_ameztoy204972621NaN[{'text': 'City', 'indices': [132, 137]}][{'screen_name': 'i_ameztoy', 'name': 'Iban Am...
12022-08-07 22:31:16+00:00<a href=\"http://twitter.com/download/android\" ...RT @IndoPac_Info: #China's media explains the ...-1.000000e-010.100000en6915831201ZIisqZIisq65272NaN[{'text': 'China', 'indices': [18, 24]}, {'tex...[{'screen_name': 'IndoPac_Info', 'name': 'Indo...
22022-08-07 22:31:07+00:00<a href=\"http://twitter.com/download/android\" ...China even cut off communication, they don't a...0.000000e+000.000000en016270Fin21FreeFin21Free85392NaN[{'text': 'XiJinping', 'indices': [127, 137]}][{'screen_name': 'ZelenskyyUa', 'name': 'Волод...Netherlands
32022-08-07 22:31:06+00:00<a href=\"http://twitter.com/download/android\" ...Putin to #XiJinping : I told you my friend, Ta...1.000000e-010.350000en016270Fin21FreeFin21Free85392NaN[{'text': 'XiJinping', 'indices': [9, 19]}][]Netherlands
42022-08-07 22:31:04+00:00<a href=\"http://twitter.com/download/iphone\" r...RT @ChinaUncensored: I’m sorry, I thought Taiw...-6.938894e-180.556250en152118958381VizziniDoloresVizziniDolores9102608NaN[][{'screen_name': 'ChinaUncensored', 'name': 'C...Ayent, Schweiz
52022-08-07 22:31:02+00:00<a href=\"http://twitter.com/download/android\" ...RT @benedictrogers: We must not let this happe...2.000000e-010.500000en1164848336GraceCh15554845GraceCh15554845207540.0[{'text': 'Taiwan', 'indices': [84, 91]}][{'screen_name': 'benedictrogers', 'name': 'Be...Melbourne, Victoria
62022-08-07 22:30:59+00:00<a href=\"http://twitter.com/download/android\" ...RT @TGTM_Official: What kind of country can co...1.583333e-010.800000en11064173411Philipkuma1Philipkuma112264NaN[{'text': 'Taiwan', 'indices': [101, 108]}, {'...[{'screen_name': 'TGTM_Official', 'name': 'The...
72022-08-07 22:30:59+00:00<a href=\"http://twitter.com/download/android\" ...RT @ChinaInfo777: #PinkFloyd singer Roger Wate...0.000000e+000.000000en10241025nhohn2011nhohn2011870508NaN[{'text': 'PinkFloyd', 'indices': [18, 28]}, {...[{'screen_name': 'ChinaInfo777', 'name': 'Chin...Florida, USA
82022-08-07 22:30:50+00:00<a href=\"http://twitter.com/download/android\" ...RT @AmbQinGang: China's SC&amp;FM Wang Yi elab...0.000000e+000.000000en1221630239ClaudioColomaRIClaudioColomaRI127263NaN[{'text': 'Taiwan', 'indices': [80, 87]}][{'screen_name': 'AmbQinGang', 'name': 'Qin Ga...El mundo periférico
92022-08-07 22:30:45+00:00<a href=\"https://mobile.twitter.com\" rel=\"nofo...RT @CGMeifangZhang: Chinese ambassador to the ...2.000000e-010.375000en4910718825jmarzola1jmarzola1213877NaN[{'text': 'USA', 'indices': [66, 70]}, {'text'...[{'screen_name': 'CGMeifangZhang', 'name': 'Zh...
\n", + "
" + ], + "text/plain": [ + " created_at \\\n", + "0 2022-08-07 22:31:20+00:00 \n", + "1 2022-08-07 22:31:16+00:00 \n", + "2 2022-08-07 22:31:07+00:00 \n", + "3 2022-08-07 22:31:06+00:00 \n", + "4 2022-08-07 22:31:04+00:00 \n", + "5 2022-08-07 22:31:02+00:00 \n", + "6 2022-08-07 22:30:59+00:00 \n", + "7 2022-08-07 22:30:59+00:00 \n", + "8 2022-08-07 22:30:50+00:00 \n", + "9 2022-08-07 22:30:45+00:00 \n", + "\n", + " source \\\n", + "0 \n", + "Int64Index: 22000 entries, 0 to 21999\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 created_at 22000 non-null datetime64[ns, UTC]\n", + " 1 source 22000 non-null object \n", + " 2 original_text 22000 non-null object \n", + " 3 polarity 22000 non-null float64 \n", + " 4 subjectivity 22000 non-null float64 \n", + " 5 lang 22000 non-null object \n", + " 6 favorite_count 22000 non-null int64 \n", + " 7 status_count 22000 non-null int64 \n", + " 8 retweet_count 22000 non-null int64 \n", + " 9 screen_name 22000 non-null object \n", + " 10 original_author 22000 non-null object \n", + " 11 followers_count 22000 non-null int64 \n", + " 12 friends_count 22000 non-null int64 \n", + " 13 possibly_sensitive 6191 non-null float64 \n", + " 14 hashtags 22000 non-null object \n", + " 15 user_mentions 22000 non-null object \n", + " 16 place 22000 non-null object \n", + "dtypes: datetime64[ns, UTC](1), float64(3), int64(5), object(8)\n", + "memory usage: 3.0+ MB\n" + ] + } + ], + "source": [ + "global_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
polaritysubjectivityfavorite_countstatus_countretweet_countfollowers_countfriends_countpossibly_sensitive
count22000.00000022000.00000022000.0000002.200000e+0422000.0000002.200000e+0422000.0000006191.000000
mean0.0613250.283839852.1373185.446036e+04176.7501821.796764e+041563.1144550.037151
std0.2237010.2909633106.0776451.454120e+05498.4357653.030478e+054358.6512640.189146
min-1.0000000.0000000.0000001.000000e+000.0000000.000000e+000.0000000.000000
25%0.0000000.0000002.0000002.105750e+032.0000005.700000e+01137.0000000.000000
50%0.0000000.200000115.0000001.038750e+0438.0000002.840000e+02487.0000000.000000
75%0.1333330.468824655.0000004.526150e+04187.0000001.324500e+031599.0000000.000000
max1.0000001.00000065170.0000004.108317e+0617409.0000001.449852e+07208360.0000001.000000
\n", + "
" + ], + "text/plain": [ + " polarity subjectivity favorite_count status_count \\\n", + "count 22000.000000 22000.000000 22000.000000 2.200000e+04 \n", + "mean 0.061325 0.283839 852.137318 5.446036e+04 \n", + "std 0.223701 0.290963 3106.077645 1.454120e+05 \n", + "min -1.000000 0.000000 0.000000 1.000000e+00 \n", + "25% 0.000000 0.000000 2.000000 2.105750e+03 \n", + "50% 0.000000 0.200000 115.000000 1.038750e+04 \n", + "75% 0.133333 0.468824 655.000000 4.526150e+04 \n", + "max 1.000000 1.000000 65170.000000 4.108317e+06 \n", + "\n", + " retweet_count followers_count friends_count possibly_sensitive \n", + "count 22000.000000 2.200000e+04 22000.000000 6191.000000 \n", + "mean 176.750182 1.796764e+04 1563.114455 0.037151 \n", + "std 498.435765 3.030478e+05 4358.651264 0.189146 \n", + "min 0.000000 0.000000e+00 0.000000 0.000000 \n", + "25% 2.000000 5.700000e+01 137.000000 0.000000 \n", + "50% 38.000000 2.840000e+02 487.000000 0.000000 \n", + "75% 187.000000 1.324500e+03 1599.000000 0.000000 \n", + "max 17409.000000 1.449852e+07 208360.000000 1.000000 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "global_data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EDA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Remove duplicated rows" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automation in Action...!!!\n" + ] + }, + { + "data": { + "text/plain": [ + "(22000, 17)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets = Clean_Tweets(global_data)\n", + "clean_tweets.df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(22000, 17)" + ] + }, + "execution_count": 7, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets.df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(7440, 17)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets = clean_tweets.drop_duplicate(global_data)\n", + "clean_tweets.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see there were too many duplicates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Remove tweets that are not english" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "en 7440\n", + "Name: lang, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets.lang.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All tweets are in English" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 7440 entries, 0 to 21997\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 created_at 7440 non-null datetime64[ns, UTC]\n", + " 1 source 7440 non-null object \n", + " 2 original_text 7440 non-null object \n", + " 3 polarity 7440 non-null float64 \n", + " 4 subjectivity 7440 non-null float64 \n", + " 5 lang 7440 non-null object \n", + " 6 favorite_count 7440 non-null int64 \n", + " 7 status_count 7440 non-null int64 \n", + " 8 retweet_count 7440 non-null int64 \n", + " 9 screen_name 7440 non-null object \n", + " 10 original_author 7440 non-null object 
\n", + " 11 followers_count 7440 non-null int64 \n", + " 12 friends_count 7440 non-null int64 \n", + " 13 possibly_sensitive 3977 non-null float64 \n", + " 14 hashtags 7440 non-null object \n", + " 15 user_mentions 7440 non-null object \n", + " 16 place 7440 non-null object \n", + "dtypes: datetime64[ns, UTC](1), float64(3), int64(5), object(8)\n", + "memory usage: 1.0+ MB\n" + ] + } + ], + "source": [ + "clean_tweets.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Only the possibly_sensitive feature has missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0 3866\n", + "1.0 111\n", + "Name: possibly_sensitive, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets.possibly_sensitive.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see there are 3,866 not sensitive and 111 sensitive tweets\n", + "And only 3,977 out of 7,440 tweets are recorded for sensitivity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handling missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "created_at 0\n", + "source 0\n", + "original_text 0\n", + "polarity 0\n", + "subjectivity 0\n", + "lang 0\n", + "favorite_count 0\n", + "status_count 0\n", + "retweet_count 0\n", + "screen_name 0\n", + "original_author 0\n", + "followers_count 0\n", + "friends_count 0\n", + "possibly_sensitive 0\n", + "hashtags 0\n", + "user_mentions 0\n", + "place 0\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets = Clean_Tweets.fill_missing(clean_tweets, df = clean_tweets, column=\"possibly_sensitive\", value = \"unknown\")\n", +
"clean_tweets.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['created_at', 'source', 'original_text', 'polarity', 'subjectivity',\n", + " 'lang', 'favorite_count', 'status_count', 'retweet_count',\n", + " 'screen_name', 'original_author', 'followers_count', 'friends_count',\n", + " 'possibly_sensitive', 'hashtags', 'user_mentions', 'place'],\n", + " dtype='object')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_tweets.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
retweet_countsourceoriginal_texthashtagsplace
02<a href=\"http://twitter.com/download/android\" ...RT @i_ameztoy: Extra random image (I):\\n\\nLets...[{'text': 'City', 'indices': [132, 137]}]
1201<a href=\"http://twitter.com/download/android\" ...RT @IndoPac_Info: #China's media explains the ...[{'text': 'China', 'indices': [18, 24]}, {'tex...
20<a href=\"http://twitter.com/download/android\" ...China even cut off communication, they don't a...[{'text': 'XiJinping', 'indices': [127, 137]}]Netherlands
30<a href=\"http://twitter.com/download/android\" ...Putin to #XiJinping : I told you my friend, Ta...[{'text': 'XiJinping', 'indices': [9, 19]}]Netherlands
4381<a href=\"http://twitter.com/download/iphone\" r...RT @ChinaUncensored: I’m sorry, I thought Taiw...[]Ayent, Schweiz
..................
219743<a href=\"https://mobile.twitter.com\" rel=\"nofo...RT @metesohtaoglu: 📌📸 Map of #China's possible...[{'text': 'China', 'indices': [29, 35]}, {'tex...Seattle, WA
219871<a href=\"http://twitter.com/download/iphone\" r...RT @NEVERBOW: China is doing #exactly what #Ru...[{'text': 'exactly', 'indices': [29, 37]}, {'t...
219890<a href=\"http://twitter.com/download/iphone\" r...Minister Wu is crystal clear in his @BBCNews i...[{'text': 'Taiwan', 'indices': [168, 175]}, {'...Toronto, Canada
219910<a href=\"http://twitter.com/download/android\" ...Reports say that #China is planning to seize #...[{'text': 'China', 'indices': [17, 23]}, {'tex...
219970<a href=\"http://twitter.com/download/android\" ...@Reuters Thanks #Pelosi smart move.[{'text': 'Pelosi', 'indices': [16, 23]}]🇺🇲🇷🇺🇺🇦🇫🇷🇦🇪🇮🇱🏳️‍🌈
\n", + "

7440 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " retweet_count source \\\n", + "0 2
\n", + "Int64Index: 7440 entries, 0 to 21997\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 created_at 7440 non-null datetime64[ns, UTC]\n", + " 1 source 7440 non-null object \n", + " 2 original_text 7440 non-null object \n", + " 3 polarity 7440 non-null float64 \n", + " 4 subjectivity 7440 non-null float64 \n", + " 5 lang 7440 non-null object \n", + " 6 favorite_count 7440 non-null int64 \n", + " 7 status_count 7440 non-null int64 \n", + " 8 retweet_count 7440 non-null int64 \n", + " 9 screen_name 7440 non-null object \n", + " 10 original_author 7440 non-null object \n", + " 11 followers_count 7440 non-null int64 \n", + " 12 friends_count 7440 non-null int64 \n", + " 13 possibly_sensitive 7440 non-null object \n", + " 14 hashtags 7440 non-null object \n", + " 15 user_mentions 7440 non-null object \n", + " 16 place 7440 non-null object \n", + "dtypes: datetime64[ns, UTC](1), float64(2), int64(5), object(9)\n", + "memory usage: 1.0+ MB\n" + ] + } + ], + "source": [ + "clean_tweets['created_at'] = pd.to_datetime(clean_tweets['created_at'])\n", + "clean_tweets.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### extract source of tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#clean_tweets[\"source\"] = clean_tweets[\"source\"].apply(Clean_Tweets.extract_device_name(self = clean_tweets, source='source'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### save current dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "clean data saved successfully\n" + ] + } + ], + "source": [ + "clean_tweets.to_csv('../data/clean_data.csv', index = False)\n", + "print('clean data saved successfully')" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3.10.5 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a265634967a27dd555e8346f2355ee703e655fd7f0a0d20c168527cd0a3d5707" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index d017ed3..15b377b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -pandas>=1.1.0 +pandas>=1.1.0 textblob>=0.15.3 \ No newline at end of file diff --git a/tests/test_extract_dataframe.py b/tests/test_extract_dataframe.py index 8d5f30d..522c2e7 100644 --- a/tests/test_extract_dataframe.py +++ b/tests/test_extract_dataframe.py @@ -1,100 +1,251 @@ -import unittest -import pandas as pd -import sys, os - -sys.path.append(os.path.abspath(os.path.join("../.."))) - -from extract_dataframe import read_json -from extract_dataframe import TweetDfExtractor - -# For unit testing the data reading and processing codes, -# we will need about 5 tweet samples. -# Create a sample not more than 10 tweets and place it in a json file. -# Provide the path to the samples tweets file you created below -sampletweetsjsonfile = "" #put here the path to where you placed the file e.g. ./sampletweets.json. 
-_, tweet_list = read_json(sampletweetsjsonfile) - -columns = [ - "created_at", - "source", - "original_text", - "clean_text", - "sentiment", - "polarity", - "subjectivity", - "lang", - "favorite_count", - "retweet_count", - "original_author", - "screen_count", - "followers_count", - "friends_count", - "possibly_sensitive", - "hashtags", - "user_mentions", - "place", - "place_coord_boundaries", -] - - -class TestTweetDfExtractor(unittest.TestCase): - """ - A class for unit-testing function in the fix_clean_tweets_dataframe.py file - - Args: - ----- - unittest.TestCase this allows the new class to inherit - from the unittest module - """ - - def setUp(self) -> pd.DataFrame: - self.df = TweetDfExtractor(tweet_list[:5]) - # tweet_df = self.df.get_tweet_df() - - def test_find_statuses_count(self): - self.assertEqual( - self.df.find_statuses_count(), - ) - - def test_find_full_text(self): - text = - - self.assertEqual(self.df.find_full_text(), text) - - def test_find_sentiments(self): - self.assertEqual( - self.df.find_sentiments(self.df.find_full_text()), - ( - , - , - ), - ) - - - def test_find_screen_name(self): - name = - self.assertEqual(self.df.find_screen_name(), name) - - def test_find_followers_count(self): - f_count = - self.assertEqual(self.df.find_followers_count(), f_count) - - def test_find_friends_count(self): - friends_count = - self.assertEqual(self.df.find_friends_count(), friends_count) - - def test_find_is_sensitive(self): - self.assertEqual(self.df.is_sensitive(), ) - - - # def test_find_hashtags(self): - # self.assertEqual(self.df.find_hashtags(), ) - - # def test_find_mentions(self): - # self.assertEqual(self.df.find_mentions(), ) - - - -if __name__ == "__main__": - unittest.main() - +import os +import sys +import unittest +import pandas as pd + +# sys.path.append(os.path.abspath(os.path.join("../.."))) +# sys.path.append(".") +sys.path.append(".") +from defaults import * + +from extract_dataframe import read_json +from extract_dataframe import 
TweetDfExtractor + +# For unit testing the data reading and processing codes, +# we will need about 5 tweet samples. +# Create a sample not more than 10 tweets and place it in a json file. +# Provide the path to the samples tweets file you created below + +_, tweet_list = read_json(processed_global_data) + +columns = [ + "created_at", + "source", + "original_text", + "clean_text", + "sentiment", + "polarity", + "subjectivity", + "lang", + "favorite_count", + "retweet_count", + "original_author", + "screen_count", + "followers_count", + "friends_count", + "possibly_sensitive", + "hashtags", + "user_mentions", + "place", + "place_coord_boundaries", +] + + +class TestTweetDfExtractor(unittest.TestCase): + """ + A class for unit-testing function in the fix_clean_tweets_dataframe.py file + + Args: + ----- + unittest.TestCase this allows the new class to inherit + from the unittest module + """ + + def setUp(self) -> pd.DataFrame: + self.df = TweetDfExtractor(tweet_list[:5]) + # tweet_df = self.df.get_tweet_df() + + def test_find_status_count(self): + """ + Test case for the find status count method + """ + # error test + # self.assertEqual(self.df.find_statuses_count(), + # [204051, 3462, 6727, 45477, 277957]) + + # the edited error test + self.assertEqual(self.df.find_status_count(), + [40, 40, 40, 40, 40]) + + def test_find_full_text(self): + """ + Test case for hte find full text method + """ + # error test case + error_text = ['🚨Africa is "in the midst of a full-blown third wave" of coronavirus, the head of @WHOAFRO has warned\n\nCases have risen across the continent by more than 20% and deaths have also risen by 15% in the last week\n\n@jriggers reports ~ 🧵\nhttps://t.co/CRDhqPHFWM', 'Dr Moeti is head of WHO in Africa, and one of the best public health experts and leaders I know. Hers is a desperate request for vaccines to Africa. We plead with Germany and the UK to lift patent restrictions and urgently transfer technology to enable production in Africa. 
https://t.co/sOgIroihOc', "Thank you @research2note for creating this amazing campaign & turning social media #red4research today. @NHSRDFORUM is all about sharing the talent, passion & commitment of individuals coming together as a community for the benefit of all. You've done this. Well done 👋", 'Former Pfizer VP and Virologist, Dr. Michael Yeadon, is one of the most credentialed medical professionals speaking out about the dangers of the #Covid19 vaccines, breaks down his “list of lies” that keeps him up at night. https://t.co/LSE8CrKdqn', 'I think it’s important that we don’t sell COVAX short. It still has a lot going for it and is innovative in its design. But it needs more vaccines to share. We’re hoping our low cost @TexasChildrens recombinant protein COVID19 vaccine with @biological_e will help fill some gaps'] + + # the edited test case + text = ['RT @nikitheblogger: Irre: Annalena Baerbock sagt, es bricht ihr das Herz, dass man nicht bedingungslos schwere Waffen liefert.\nMir bricht e\u2026', + 'RT @sagt_mit: Merkel schaffte es in 1 Jahr 1 Million \"Fl\u00fcchtlinge\" durchzuf\u00fcttern, jedoch nicht nach 16 Jahren 1 Million Rentner aus der Ar\u2026', + 'RT @Kryptonoun: @WRi007 Pharma in Lebensmitteln, Trinkwasser, in der Luft oder in der Zahnpasta irgendwo muss ein Beruhigungsmittel bzw. Be\u2026', + 'RT @WRi007: Die #Deutschen sind ein braves Volk!. Mit #Spritpreisen von 2 Euro abgefunden. Mit #inflation abgefunden. Mit h\u00f6heren #Abgaben\u2026', + 'RT @RolandTichy: Baerbock verk\u00fcndet mal so nebenhin in Riga das Ende der Energieimporte aus Russland. 
Habeck rudert schon zur\u00fcck, Scholz sc\u2026'] + self.assertEqual(self.df.find_full_text(), text) + + def test_find_sentiments(self): + """ + Test case for the find sentiments method + """ + # error test case + error_sentiment_values = ([0.16666666666666666, 0.13333333333333333, + 0.3166666666666667, 0.08611111111111111, + 0.27999999999999997], + [0.18888888888888888, 0.45555555555555555, + 0.48333333333333334, 0.19722222222222224, + 0.6199999999999999]) + + # the edited error test + sentiment_values = ([0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0]) + self.assertEqual(self.df.find_sentiments(self.df.find_full_text()), + sentiment_values) + + def test_find_created_time(self): + """ + Test case for the find created time method + """ + # error test case + created_at = ['Fri Jun 18 17:55:49 +0000 2021', + 'Fri Jun 18 17:55:59 +0000 2021', + 'Fri Jun 18 17:56:07 +0000 2021', + 'Fri Jun 18 17:56:10 +0000 2021', + 'Fri Jun 18 17:56:20 +0000 2021'] + + # the edited test case + really_created_at = ['Fri Apr 22 22:20:18 +0000 2022', + 'Fri Apr 22 22:19:16 +0000 2022', + 'Fri Apr 22 22:17:28 +0000 2022', + 'Fri Apr 22 22:17:20 +0000 2022', + 'Fri Apr 22 22:13:15 +0000 2022'] + self.assertEqual(self.df.find_created_time(), really_created_at) + + def test_find_source(self): + """ + Test case for the find source method + """ + # error test case + error_source = ['Twitter for iPhone', 'Twitter Web App', 'Twitter for iPhone', 'Twitter Web App', 'Twitter for Android'] + + # the edited test case + source = ['Twitter for Android', 'Twitter for Android', 'Twitter for Android', 'Twitter for Android', 'Twitter for Android'] + self.assertEqual(self.df.find_source(), source) + + def test_find_screen_name(self): + """ + Test case for the find screen name method + """ + # error test case + error_name_test_Case = ['ketuesriche', 'Grid1949', + 'LeeTomlinson8', 'RIPNY08', 'pash22'] + # the edited error test + name = ['McMc74078966', 'McMc74078966', 'McMc74078966', + 
'McMc74078966', 'McMc74078966'] + self.assertEqual(self.df.find_screen_name(), name) + + def test_find_followers_count(self): + """ + Test case for the find followers count method + """ + # error test + error_f_count = [551, 66, 1195, 2666, 28250] + + # the edited error test + f_count = [3, 3, 3, 3, 3] + self.assertEqual(self.df.find_followers_count(), f_count) + + def test_find_friends_count(self): + """ + Test case for the find friends count method + """ + # error test + error_friends_count = [351, 92, 1176, 2704, 30819] + + # edited error test + friends_count = [12, 12, 12, 12, 12] + self.assertEqual(self.df.find_friends_count(), friends_count) + + def test_find_is_sensitive(self): + self.assertEqual(self.df.is_sensitive(), + [None, None, None, None, None]) + + def test_find_hashtags(self): + """ + Test case for the find hashtags method + """ + hashtags = [[], [], [], [{'indices': [16, 26], 'text': 'Deutschen'}, + {'indices': [54, 67], 'text': 'Spritpreisen'}, + {'indices': [95, 105], 'text': 'inflation'}, + {'indices': [130, 138], 'text': 'Abgaben'}], + []] + self.assertEqual(self.df.find_hashtags(), hashtags) + + def test_find_mentions(self): + """ + Test case for the find mentions method + """ + mentions = [[{"screen_name": "nikitheblogger", + "name": "Neverforgetniki", "id": 809188392089092097, + "id_str": "809188392089092097", "indices": [3, 18]}], + [{"screen_name": "sagt_mit", + "name": "Sie sagt es mit Bildern", + "id": 1511959918777184256, + "id_str": "1511959918777184256", + "indices": [3, 12]}], + [{"screen_name": "Kryptonoun", + "name": "Kryptoguru", "id": 951051508321345536, + "id_str": "951051508321345536", "indices": [3, 14]}, + {"screen_name": "WRi007", "name": "Wolfgang Berger", + "id": 1214543251283357696, + "id_str": "1214543251283357696", "indices": [16, 23]}], + [{"screen_name": "WRi007", + "name": "Wolfgang Berger", "id": 1214543251283357696, + "id_str": "1214543251283357696", "indices": [3, 10]}], + [{"screen_name": "RolandTichy", "name": 
"Roland Tichy", + "id": 19962363, "id_str": "19962363", "indices": [3, 15]} + ]] + self.assertEqual(self.df.find_mentions(), mentions) + + def test_find_location(self): + """ + Test case for the find location method + """ + # error test + error_locations = ['Mass', 'Edinburgh, Scotland', None, None, + 'United Kingdom'] + + # edited error test + locations = ['', '', '', '', ''] + self.assertEqual(self.df.find_location(), locations) + + def test_find_lang(self): + """ + Test case for the find lang method + """ + langs = ['de', 'de', 'de', 'de', 'de'] + self.assertEqual(self.df.find_lang(), langs) + + def test_find_retweet_count(self): + """ + Test case for the find retweet count method + """ + # error test + error_retweets_test_Case = [612, 92, 1, 899, 20] + + # the edited error test + retweets = [355, 505, 4, 332, 386] + self.assertEqual(self.df.find_retweet_count(), retweets) + + def test_find_favorite_count(self): + """ + Test case for the find favorite count method + """ + # error test + # self.assertEqual(self.df.find_favorite_count(), + # [548, 195, 2, 1580, 72]) + + # the edited error test + self.assertEqual(self.df.find_favorite_count(), + [2356, 1985, 16, 1242, 1329]) + +if __name__ == "__main__": + unittest.main()