#######################################
# QTA Final
#######################################
## Load packages
pkgTest <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
library(dplyr)
library(ggplot2)
lapply(c("tidyverse",
         "guardianapi",         # for working with the Guardian's API
         "quanteda",            # for QTA
         "quanteda.textstats",  # more Quanteda!
         "quanteda.textplots",  # even more Quanteda!
         "readtext",            # for reading in text data
         "stringi",             # for working with character strings
         "textstem"             # an alternative method for lemmatizing
         ), pkgTest)
########################################
####################
####################
# ANALYSIS OF GAZA/ISRAEL CORPUS
####################
####################
########################################
###
### Using the Guardian API with R
###
gu_api_key() # run this interactive function
# load in the data with dates of interest
dat <- gu_content(query = "Ukraine", from_date = "2022-01")
head(dat)
data <- dat # make a duplicate
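# Optional sketch (not part of the original pull): gu_content() also takes a
# to_date argument, which can be used to bound the query to the period that is
# actually analysed below. The dates here are illustrative; the call is
# commented out so the script does not hit the API a second time.
# dat_bounded <- gu_content(query = "Ukraine",
#                           from_date = "2022-01-01",
#                           to_date = "2024-02-07")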
###
### Clean up the data frame
###
data <- data[data$type == "article" & data$section_id == "world",] # subset to the articles we want
head(data)
typeof(data)
which(duplicated(data$web_title) == TRUE) # sometimes there are duplicates...
data <- data[!duplicated(data$web_title),] # which we can remove
View(data)
# Tidy up our data, only keeping the variables I need
tidy_ukr <- data %>%
  select(headline,
         date = web_publication_date, # change the name of the date variable
         wordcount,
         standfirst,
         web_title,
         body_text
         ) %>%
  mutate(date = as_datetime(date))
tidy_ukr$date <- as.Date(tidy_ukr$date)
# Check that the column has been converted to the Date class
class(tidy_ukr$date)
print(tidy_ukr$date[1])
View(tidy_ukr)
summary(tidy_ukr, 5)
#lets check the year of 2023
start_date <- as.Date("2023-01-01")
end_date <- as.Date("2023-12-31")
ukr_filtered <- tidy_ukr[tidy_ukr$date >= start_date & tidy_ukr$date <= end_date, ]
View(ukr_filtered)
###
### Make a corpus
###
# When building a corpus from a data frame, the remaining columns
# (date, wordcount, standfirst, body_text) are stored automatically as docvars
# create corpus
corp_ukr <- corpus(x = tidy_ukr,
                   docid_field = "web_title",
                   text_field = "headline")
# Lets check out some of these documents!!!
summary(corp_ukr,5)
docvars(corp_ukr[2])
print(corp_ukr[7])
print(corp_ukr[35])
ukr_corp_sum <- summary(corp_ukr,
                        n = nrow(docvars(corp_ukr)))
print(ukr_corp_sum)
###
# CHECK THE NUMBER OF ARTICLES WRITTEN
###
# Convert datetime to date
corp_ukr$date <- as.Date(corp_ukr$date)
# Filter corpus for the desired range of time
start_date <- as.Date("2023-01-01")
end_date <- as.Date("2023-12-31")
corp_filtered <- corp_ukr[corp_ukr$date >= start_date & corp_ukr$date <= end_date]
# Count articles per day
article_counts <- table(corp_filtered$date)
# Print the counts
print(article_counts)
# Plot the number of observations for each date
# based on the plot there seems to be no drop in the number of articles about Ukraine
plot(article_counts,
     main = "Number of Observations by Date",
     xlab = "Date",
     ylab = "Number of Observations",
     col = "skyblue",
     las = 2) # Rotate x-axis labels vertically
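# Optional sketch (not in the original analysis): the same daily counts drawn
# with ggplot2 (loaded above), which handles date axes a little more gracefully.
counts_df <- data.frame(date = as.Date(names(article_counts)),
                        n = as.integer(article_counts))
ggplot(counts_df, aes(x = date, y = n)) +
  geom_col(fill = "skyblue") +
  labs(title = "Number of Articles by Date",
       x = "Date", y = "Number of articles")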
###
# CHECK FOR DIFFERENCE IN MEANS
###
# Subset the data for 5 months before and 5 months after the attack
five_before <- as.Date("2023-05-07")
five_after <- as.Date("2024-02-07")
ten_months_df <- tidy_ukr[tidy_ukr$date >= five_before & tidy_ukr$date <= five_after, ]
#order by date
ten_months_df <- ten_months_df[order(ten_months_df$date),]
# check to ensure we have relevant dates
max(ten_months_df$date)
min(ten_months_df$date)
View(ten_months_df)
# This is the date of the Hamas attack on the music festival
specified_date <- as.Date("2023-10-07")
# Calculate the daily counts of articles before and after the specified date
daily_counts_before <- table(as.Date(ten_months_df$date[ten_months_df$date < specified_date]))
daily_counts_after <- table(as.Date(ten_months_df$date[ten_months_df$date >= specified_date]))
# Print the daily counts
print(daily_counts_before)
print(daily_counts_after)
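# Quick descriptive check (not in the original script): compare the mean and
# spread of daily article counts before and after the attack, before running
# the formal test below.
mean(daily_counts_before); sd(daily_counts_before)
mean(daily_counts_after); sd(daily_counts_after)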
####### Perform Test ############
install.packages("BSDA") # for the z.test() function
library(BSDA)
# Perform z-test for difference in means
z_test_result <- z.test(x = daily_counts_before,
                        y = daily_counts_after,
                        alternative = "two.sided",
                        sigma.x = sd(daily_counts_before),
                        sigma.y = sd(daily_counts_after))
?z.test
# Print the results
print(z_test_result)
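# Optional (not in the original script): z.test() returns an "htest" object,
# so the test statistic and p-value can be pulled out directly for reporting.
z_test_result$statistic
z_test_result$p.value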
##################################
# Subset the data for 2 months before and 2 months after the attack
two_before <- as.Date("2023-08-07")
two_after <- as.Date("2023-12-07")
two_month_df <- tidy_ukr[tidy_ukr$date >= two_before & tidy_ukr$date <= two_after, ]
#order by date
two_month_df <- two_month_df[order(two_month_df$date),]
# check to ensure we have relevant dates
max(two_month_df$date)
min(two_month_df$date)
# Calculate the daily counts of articles before and after the specified date
two_daily_counts_before <- table(as.Date(two_month_df$date[two_month_df$date < specified_date]))
two_daily_counts_after <- table(as.Date(two_month_df$date[two_month_df$date >= specified_date]))
####### Perform Test ############
two_z_test_result <- z.test(x = two_daily_counts_before,
                            y = two_daily_counts_after,
                            alternative = "two.sided",
                            sigma.x = sd(two_daily_counts_before),
                            sigma.y = sd(two_daily_counts_after))
# Print the results
print(two_z_test_result)
########################################
####################
####################
# ANALYSIS OF UKRAINE CORPUS
####################
####################
########################################
##SUBSET FIRST MONTH OF DATA
first_month_ukr <- subset(tidy_ukr, date >= as.Date("2022-02-24") & date <= as.Date("2022-03-24"))
View(first_month_ukr)
length(first_month_ukr$headline)
#check for dupes
which(duplicated(first_month_ukr$headline))
#save it to my computer
write.csv(first_month_ukr, "/Users/riccimason99/Downloads/first_month_ukr.csv", row.names = FALSE)
###
#MAKE CORPUS
###
# As above, the remaining columns of first_month_ukr
# (headline, date, wordcount, standfirst) are stored automatically as docvars
# create corpus from the full article text
first_corp_ukr <- corpus(x = first_month_ukr,
                         docid_field = "web_title",
                         text_field = "body_text")
as.character(first_corp_ukr)[1] # view the full text of the first article
first_corp_ukr_sum <- summary(first_corp_ukr)
dim(first_corp_ukr_sum)
###
# TOKENIZE
###
token <- quanteda::tokens(first_corp_ukr,
                          remove_punct = TRUE,
                          remove_symbols = TRUE,
                          remove_url = TRUE)
is.tokens(token)
# Lowercase the text
token <- tokens_tolower(token)
print(token[10]) # print lowercase tokens from the 10th article in corpus.
stop_list <- stopwords("english") # load English stopwords from quanteda
super_stop <- c("p", "h2", "href", "a", "said") # HTML remnants left in the body text, plus the reporting verb "said"
stop_list <- c(stop_list, super_stop)
head(stop_list) # show first 6 stopwords from stopword list.
print(stop_list)
# The tokens_remove() function allows us to apply the stop_list to our token object
token <- tokens_remove(token, stop_list)
print(token[10])
#stemming
stem_toks <- tokens_wordstem(token)
#check it
print(token[10])
print(stem_toks[10])
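# Optional sketch (not in the original analysis): textstem, loaded above as an
# alternative to stemming, can lemmatize the same tokens instead, keeping full
# dictionary words (e.g. "attacks" -> "attack") rather than truncated stems.
lemma_toks <- tokens_replace(token,
                             pattern = types(token),
                             replacement = lemmatize_words(types(token)),
                             valuetype = "fixed")
print(lemma_toks[10])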
#collocations
collocations <- textstat_collocations(stem_toks, size = 2)
#View(collocations)
# Choose which collocations to keep (the first 20)
keep_coll_list <- collocations$collocation[1:20]
keep_coll_list
comp_tok <- tokens_compound(stem_toks, pattern = phrase(keep_coll_list))
###
# Convert to dfm
###
dfm_ukr <- dfm(comp_tok)
topfeatures(dfm_ukr)
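# Optional sketch (not in the original analysis): a quick word cloud of the
# most frequent features, using quanteda.textplots (loaded above).
textplot_wordcloud(dfm_ukr, max_words = 100)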
####### Sentiment Analysis
# Load the built-in sentiment dictionary
data("data_dictionary_LSD2015")
dfm_sentiment <- dfm_lookup(dfm_ukr, data_dictionary_LSD2015[1:2])
dfm_sentiment
# Proportion of negative and positive terms in each document,
# relative to that document's total count of dictionary (sentiment) tokens
neg <- docvars(dfm_sentiment, "prop_negative") <- as.numeric(dfm_sentiment[, 1] / ntoken(dfm_sentiment))
pos <- docvars(dfm_sentiment, "prop_positive") <- as.numeric(dfm_sentiment[, 2] / ntoken(dfm_sentiment))
net_sent <- (sum(pos)-sum(neg))
net_sent
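# Optional sketch (not in the original analysis): mean net sentiment per day,
# assuming the "date" docvar has carried through from first_month_ukr to the
# sentiment dfm.
sent_df <- data.frame(date = docvars(dfm_sentiment, "date"),
                      net = pos - neg)
daily_sent <- aggregate(net ~ date, data = sent_df, FUN = mean)
ggplot(daily_sent, aes(x = date, y = net)) +
  geom_line(colour = "skyblue") +
  labs(title = "Mean Net Sentiment per Day (First Month)",
       x = "Date", y = "Net sentiment (positive - negative share)")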