diff --git a/fix_clean_tweets_dataframe.py b/clean_tweets_dataframe.py similarity index 60% rename from fix_clean_tweets_dataframe.py rename to clean_tweets_dataframe.py index 7b45a35..f5ebcc5 100644 --- a/fix_clean_tweets_dataframe.py +++ b/clean_tweets_dataframe.py @@ -15,22 +15,33 @@ def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame: df.drop(unwanted_rows , inplace=True) df = df[df['polarity'] != 'polarity'] - return df + return df + + + def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame: """ - drop duplicate rows + drop duplicate rows from the DataFrame + + Parameters: + df (pd.DataFrame): Dataframe from which to remove duplicates + + Returns: + pd.DataFrame: A Dataframe with duplicates removed """ - --- + df = df.drop_duplicates(keep='first',inplace=False) + + print("Duplicate rows removed from the DataFrame.") return df + + def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame: """ convert column to datetime """ - ---- - - ---- + df['created_at'] = pd.to_datetime(df['created_at']) df = df[df['created_at'] >= '2020-12-31' ] @@ -41,18 +52,21 @@ def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame: convert columns like polarity, subjectivity, retweet_count favorite_count etc to numbers """ - df['polarity'] = pd.---- - - ---- - ---- + df['polarity'] = pd.to_numeric(df['polarity']) + df['subjectivity'] = pd.to_numeric(df['subjectivity']) + df['retweet_count'] = pd.to_numeric(df['retweet_count']) + df['favorite_count'] = pd.to_numeric(df['favorite_count']) + df['retweet_count'] = pd.to_numeric(df['retweet_count']) + #df['followers_count'] = pd.to_numeric(df['followers_count']) return df - + + def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame: """ remove non english tweets from lang """ - df = ---- + df = df[df['lang'] == 'en'] return df \ No newline at end of file diff --git a/eda.ipynb b/eda.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/fix_extract_dataframe.py b/extract_dataframe.py similarity 
index 93% rename from fix_extract_dataframe.py rename to extract_dataframe.py index 3bd792d..51449bf 100644 --- a/fix_extract_dataframe.py +++ b/extract_dataframe.py @@ -39,7 +39,11 @@ def find_statuses_count(self)->list: statuses_count def find_full_text(self)->list: - text = + """ + This function gets the entire text from the tweet strings + """ + text = [items.get('full_text', None) for items in self.tweets_list] + return text def find_sentiments(self, text)->list: @@ -47,8 +51,14 @@ def find_sentiments(self, text)->list: return polarity, self.subjectivity def find_created_time(self)->list: + created_at = [] + for items in self.tweets_list: + created_at.append(items.get('created_at',None)) return created_at + + + def find_source(self)->list: source =