From 54f074924dae35df02b8717afbb8fb242713badf Mon Sep 17 00:00:00 2001 From: abu14 Date: Tue, 21 Jan 2025 17:14:25 +0300 Subject: [PATCH 1/2] Done editing the clean tweets file --- ..._dataframe.py => clean_tweets_dataframe.py | 38 +++++++++++++------ eda.ipynb | 0 ...tract_dataframe.py => extract_dataframe.py | 0 3 files changed, 26 insertions(+), 12 deletions(-) rename fix_clean_tweets_dataframe.py => clean_tweets_dataframe.py (60%) create mode 100644 eda.ipynb rename fix_extract_dataframe.py => extract_dataframe.py (100%) diff --git a/fix_clean_tweets_dataframe.py b/clean_tweets_dataframe.py similarity index 60% rename from fix_clean_tweets_dataframe.py rename to clean_tweets_dataframe.py index 7b45a35..f5ebcc5 100644 --- a/fix_clean_tweets_dataframe.py +++ b/clean_tweets_dataframe.py @@ -15,22 +15,33 @@ def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame: df.drop(unwanted_rows , inplace=True) df = df[df['polarity'] != 'polarity'] - return df + return + + + def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame: """ - drop duplicate rows + drop duplicate rows from the DataFrame + + Parameters + df (pd.DataFrame): Dataframe from which to remove duplicates + + Returns: + pd.DataFrame: A Dataframe with duplicates removed """ - --- + df = df.drop_duplicates(keep='first',inplace=False) + + print("Duplicate rows removed from the DataFrame.") return df + + def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame: """ convert column to datetime """ - ---- - - ---- + df['created_at'] = pd.to_datetime(df['created_at']) df = df[df['created_at'] >= '2020-12-31' ] @@ -41,18 +52,21 @@ def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame: convert columns like polarity, subjectivity, retweet_count favorite_count etc to numbers """ - df['polarity'] = pd.---- - - ---- - ---- + df['polarity'] = pd.to_numeric(df['polarity']) + df['subjectivity'] = pd.to_numeric(df['subjectivity']) + df['retweet_count'] = pd.to_numeric(df['retweet_count']) +
df['favorite_count'] = pd.to_numeric(df['favorite_count']) + df['retweet_count'] = pd.to_numeric(df['retweet_count']) + #df['followers_count'] = pd.to_numeric(df['followers_count']) return df - + + def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame: """ remove non english tweets from lang """ - df = ---- + df = pd.drop(df[df['lang'] == 'en']) return df \ No newline at end of file diff --git a/eda.ipynb b/eda.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/fix_extract_dataframe.py b/extract_dataframe.py similarity index 100% rename from fix_extract_dataframe.py rename to extract_dataframe.py From 20c86cfa3c48711887faaf7a4a98f2b871e938f9 Mon Sep 17 00:00:00 2001 From: abu14 Date: Tue, 21 Jan 2025 17:33:16 +0300 Subject: [PATCH 2/2] Final Commit for name editing branch --- extract_dataframe.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/extract_dataframe.py b/extract_dataframe.py index 3bd792d..51449bf 100644 --- a/extract_dataframe.py +++ b/extract_dataframe.py @@ -39,7 +39,11 @@ def find_statuses_count(self)->list: statuses_count def find_full_text(self)->list: - text = + """ + This function gets the entire text from the tweet strings + """ + text = full_text + return text def find_sentiments(self, text)->list: @@ -47,8 +51,14 @@ def find_sentiments(self, text)->list: return polarity, self.subjectivity def find_created_time(self)->list: + created_at = [] + for items in self.tweets_list: + created_at.append(items.get('craeted_at',None)) return created_at + + + def find_source(self)->list: source =