# Exam - January 22

# Working directory, packages and data ----
# NOTE(review): this absolute path is machine-specific; the later readRDS()
# calls use bare file names and therefore depend on it. Adjust it when running
# the script on another computer.
setwd("C:\\Users\\myfti\\Desktop\\Bergamo - Lezioni\\Text Mining\\Tutorato")

library(tidyverse) # data manipulation (dplyr, stringr, ggplot2, ...)
library(tidytext)  # tidy text mining

# Movie metadata used throughout exercise 3 (Title, Phase, Date columns).
movies <- readRDS("movies.rds")

# 3.a Films about the Avengers

# Literal pattern: keeps only the films whose title contains the exact,
# case-sensitive text "Avengers" (plural only).
movies %>%
  filter(str_detect(Title, "Avengers"))

# Regular expression with an optional plural: "Avenger(|s)" matches "Avenger"
# followed by either nothing or "s". According to the original note, this
# variant finds one more film than the literal pattern above.
movies %>%
  filter(str_detect(Title, "Avenger(|s)"))

# Case-insensitive variant that returns the matching titles themselves rather
# than the filtered data frame. TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
str_subset(movies$Title, regex("avengers", ignore_case = TRUE))

# 3.b Movies with a subtitle

# A colon in the title is taken as the marker of a subtitle; summing the
# logical vector gives the number of titles that contain one.
movies %>%
  pull(Title) %>%
  str_detect(":") %>%
  sum()

# 3.c Average title length by phase

# Variant 1: compute the title length directly inside the summary.
movies %>%
  group_by(Phase) %>%
  summarise(avg = Title %>% str_length() %>% mean())

# Variant 2: store the length in a helper column first, then average it
# within each phase.
movies %>%
  mutate(len = str_length(Title)) %>%
  group_by(Phase) %>%
  summarise(avg = mean(len))

# 3.d Month and year
# Date is a string such as "May 4, 2012".
# "[a-zA-Z]" is a single alphabetical character, lower case (a-z) or upper
# case (A-Z); "+" means one or more of them (a whole word, not one letter).
# str_extract() returns the first match, i.e. the first run of letters in
# Date, which corresponds to the name of the month.
movies <- movies %>%
  mutate(
    month = str_extract(Date, "[a-zA-Z]+"),
    year = word(Date, -1) # last word of the Date string, i.e. the year
  )
movies

# or
# (run ?stringr::word interactively for details on negative positions; the
# help lookup is kept out of the script so that sourcing it has no side effect)
movies <- movies %>%
  mutate(
    month = str_extract(Date, "[a-zA-Z]+"),
    # Same result: "\\w+" is a run of word characters (letters, digits,
    # underscore) and "$" anchors it to the end of the string.
    year = str_extract(Date, "\\w+$")
  )

movies

# 3.e Most popular month

# Count the films per month and keep the month(s) with the highest count.
# slice_max() replaces the superseded top_n(): the ordering column `n` is now
# named explicitly instead of being guessed (with a message) from the last
# column, and ties are still kept (with_ties defaults to TRUE).
movies %>%
  count(month) %>%
  slice_max(n, n = 1)


#### Topic Modeling

# 4a read and tibble
# Load the stored object holding the movie texts and convert it to a tibble.
# NOTE(review): the later steps use its `doc_id` and `text` columns; what the
# texts actually are (plot descriptions?) is not visible from this script.
movies_topic <- readRDS("moviesTopic.rds") %>%
  tibble()

# 4b tidy data
# Tokenise the texts once (one row per word) and reuse the result, instead of
# repeating unnest_tokens() for every step.
movie_tokens <- movies_topic %>%
  unnest_tokens(word, text)

# Total number of words, including non-significant ones such as "the", "and",
# "of" (the original note reports 1288).
movie_tokens %>%
  count()

# Remove stop words (common words such as "the", "and", ...). The join key is
# given explicitly so the join does not depend on whichever columns the two
# tables happen to share and does not emit a "Joining with" message.
# tidy_movies is the clean result reused below to count words, build the
# Document-Term Matrix (DTM) and fit the topic model.
tidy_movies <- movie_tokens %>%
  anti_join(stop_words, by = "word")

# Words left after removing the stop words (the original note reports 616).
tidy_movies %>%
  count()


# 4c Frequency in text
# Occurrences of each word over the whole corpus, most frequent first.
tidy_movies %>%
  count(word, sort = TRUE)


# 4d Frequency in documents and dtm
# Count each word within each document.
# NOTE(review): this overwrites tidy_movies (one row per token) with the
# per-document counts, so re-running this section or section 4c afterwards
# gives different results unless tidy_movies is rebuilt first.
tidy_movies <- count(tidy_movies, doc_id, word)

# Cast the counts into a document-term matrix: one row per document, one
# column per word, cell value n.
dtm <- cast_dtm(tidy_movies, document = doc_id, term = word, value = n)

dtm


# 4e LDA
library(topicmodels)

# Fit a topic model on the document-term matrix.
#   k    : number of topics
#   seed : fixed so that the fit is reproducible
movies_lda <- LDA(
  dtm,
  k = 4,
  control = list(seed = 654)
)



# 4f-g plot and label topics
# Extract the beta matrix: per-topic, per-word probabilities.
tidy_topics <- tidy(movies_lda, matrix = "beta")

tidy_topics

# Keep the 7 highest-beta words of each topic. with_ties = FALSE guarantees
# exactly 7 rows per topic even when betas are tied. TRUE/FALSE are spelled
# out because `T`/`F` are ordinary variables that can be reassigned.
top_terms <- tidy_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 7, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar chart per topic. reorder_within() and scale_y_reordered() order the
# terms by beta separately inside each facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = beta, y = term)) +
  geom_col(show.legend = FALSE, fill = "skyblue") +
  facet_wrap(~topic, scales = "free") +
  scale_y_reordered() +
  theme_bw()



# 4h gamma and comparison
# gamma: probability that a document belongs to each topic.
tidy_gamma <- tidy(movies_lda, matrix = "gamma")

# Inspect the topic mixture of two specific documents (1 and 18), dominant
# topic first. The ids are written as strings, which is what the numeric
# literals were coerced to when compared with the `document` column
# (presumably character, as the original comparison worked).
tidy_gamma %>%
  filter(document == "1") %>%
  arrange(desc(gamma))

tidy_gamma %>%
  filter(document == "18") %>%
  arrange(desc(gamma))

# Document 1 is predominantly about topic 4 (Godfather), which is correct.
# Document 18 is predominantly about topic 3 (0.7, Avengers) and topic 1
# (0.3, The Lord of the Rings).
# However, the document is actually from the Lord of the Rings saga.