# Exam - January 22 ----
#
# Part 3: string manipulation on the `movies` data set.
# Reads `movies.rds` from the working directory and uses its `Title`, `Phase`
# and `Date` columns. Adds `month` and `year` columns to `movies`.

library(tidyverse) # data manipulation
library(tidytext)  # text mining using "tidy" principles

# The .rds files are read relative to the working directory. The original
# author's folder is used when it exists, so the script behaves as before on
# that machine; on any other machine start R in the folder holding the data
# (an unguarded setwd() would stop the script with an error there).
data_dir <- "C:\\Users\\myfti\\Desktop\\Bergamo - Lezioni\\Text Mining\\Tutorato"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}

movies <- readRDS("movies.rds")

# 3.a Films about the Avengers ----

# Films whose title contains the exact text "Avengers".
movies %>%
  filter(str_detect(Title, "Avengers"))

# Regular expression matching "Avenger" or "Avengers" (singular or plural).
# More flexible than the literal match; per the original note it finds one
# more film.
movies %>%
  filter(str_detect(Title, "Avenger(|s)"))

# Case-insensitive alternative that returns the matching titles themselves.
str_subset(movies$Title, regex("avengers", ignore_case = TRUE))

# 3.b Movies with a subtitle ----

# Count the titles containing a colon, taken as the sign of a subtitle.
sum(str_detect(movies$Title, ":"))

# 3.c Average title length by phase ----

movies %>%
  group_by(Phase) %>%
  summarize(avg = mean(str_length(Title)))

# Equivalent: compute the length as a column first, then average per phase.
movies %>%
  mutate(len = str_length(Title)) %>%
  group_by(Phase) %>%
  summarize(avg = mean(len))

# 3.d Month and year ----

# `Date` is a string (the original note gives "May 4, 2012" as an example).
# "[a-zA-Z]+" matches the first run of one or more letters, i.e. the month
# name; word(Date, -1) takes the last word of the string, i.e. the year.
movies <- movies %>%
  mutate(
    month = str_extract(Date, "[a-zA-Z]+"),
    year = word(Date, -1)
  )
movies

# Alternative for the year (see ?stringr::word for the helper used above):
# "\\w+$" extracts the final run of word characters ($ = end of string).
movies <- movies %>%
  mutate(
    month = str_extract(Date, "[a-zA-Z]+"),
    year = str_extract(Date, "\\w+$")
  )
movies

# 3.e Most popular month ----

# slice_max() replaces the superseded top_n(); the ordering column is named
# explicitly instead of being guessed from the last column. Ties are kept.
movies %>%
  count(month) %>%
  slice_max(n, n = 1)

#### Topic Modeling ----
# Part 4: topic modelling on the texts stored in `moviesTopic.rds`.
# Uses the `doc_id` and `text` columns of that file. Creates `movies_topic`,
# `tidy_movies`, `dtm`, `movies_lda`, `tidy_topics`, `top_terms` and
# `tidy_gamma`.

# Attaching a package that is already attached is harmless, so this section
# can also be run on its own.
library(tidyverse)
library(tidytext)
library(topicmodels)

# 4a Read and convert to tibble ----

# The file holds text associated with the films (descriptions, presumably).
movies_topic <- readRDS("moviesTopic.rds") %>%
  tibble()

# 4b Tidy data ----

# Tokenize once into one word per row and reuse the result below.
movie_words <- movies_topic %>%
  unnest_tokens(word, text)

# Total number of tokens, including non-significant words such as "the",
# "and", "of" (the original note reports 1288).
movie_words %>%
  count()

# Drop stop words. The join key is spelled out so the join does not depend on
# whichever columns the two tables happen to share.
tidy_movies <- movie_words %>%
  anti_join(stop_words, by = "word")

# Number of tokens left after removing stop words (the original note
# reports 616).
tidy_movies %>%
  count()

# 4c Frequency in the whole text ----

tidy_movies %>%
  count(word) %>%
  arrange(desc(n))

# 4d Frequency per document and document-term matrix ----

# From here on `tidy_movies` holds one row per (doc_id, word) with its count.
tidy_movies <- tidy_movies %>%
  count(doc_id, word)

dtm <- tidy_movies %>%
  cast_dtm(doc_id, word, n)
dtm

# 4e LDA ----

# k = 4 topics; the seed makes the fit reproducible.
movies_lda <- LDA(dtm, k = 4, control = list(seed = 654))

# 4f-g Plot and label topics ----

# beta: per-topic, per-word probabilities.
tidy_topics <- tidy(movies_lda, matrix = "beta")
tidy_topics

# Top 7 words of each topic (ties broken arbitrarily so each topic has
# exactly 7 rows).
top_terms <- tidy_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 7, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar chart per topic; reorder_within() / scale_y_reordered() sort the
# terms by beta inside each facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term)) +
  geom_col(show.legend = FALSE, fill = "skyblue") +
  facet_wrap(~topic, scales = "free") +
  scale_y_reordered() +
  theme_bw()

# 4h Gamma and comparison ----

# gamma: probability that a document belongs to each topic.
tidy_gamma <- tidy(movies_lda, matrix = "gamma")

# The dominant topic is explored for specific documents (1 and 18).
# Topic mix of two specific documents, most probable topic first.
# Requires `tidy_gamma` (the result of tidy(movies_lda, matrix = "gamma"))
# and dplyr to be attached. Each pipeline is its own statement: two
# pipelines written on one line without a separator do not parse.
tidy_gamma %>%
  filter(document == 1) %>%
  arrange(desc(gamma))

tidy_gamma %>%
  filter(document == 18) %>%
  arrange(desc(gamma))

# Interpretation recorded by the original author:
# - Document 1 is predominantly about topic 4 (The Godfather), which is
#   correct.
# - Document 18 is predominantly about topic 3 (gamma about 0.7, Avengers)
#   and topic 1 (gamma about 0.3, The Lord of the Rings). However, the
#   document actually belongs to the Lord of the Rings saga, so the dominant
#   topic is wrong for it.