# Text Mining on Coffee Posts

# --- 0. Libraries ---
# Load all the necessary libraries
library(tidyverse)    # Data manipulation and visualization
library(tidytext)     # Text mining following tidy data principles
library(wordcloud)    # To create word clouds
library(widyr)        # To calculate word pair correlations and co-occurrences
library(igraph)       # To create network graphs
library(ggraph)       # To visualize network graphs
library(RColorBrewer) # To use color palettes for plots

# --- 1. Import Dataset ---
# Read the CSV file into a tibble
coffee.posts <- read_csv("C:/Users/myfti/Downloads/coffee_posts_final.csv")

# --- 2. Inspect the dataset ---
# Look at the structure of the data to understand its columns and types
glimpse(coffee.posts)
# Alternatives - preview the first rows, or open the data in a viewer:
head(coffee.posts)
view(coffee.posts)

# --- 3. Select variables n.doc and text ---
# Keep only the columns we need: the document id (n.doc) and the post text
coffee.posts <- coffee.posts |>
  select(n.doc, text)

# --- 4. Tokenization and Stopwords Removal ---
# Tokenize the text into words and remove common stopwords
# unnest_tokens() splits each text into single words
# anti_join(stop_words) removes uninformative common words like "the", "and"
tidy.coffee <- coffee.posts |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")

# --- 5. Frequency Table ---
# Count the number of occurrences of each word
# tidy.coffee: the dataset where each row contains a single word (token) extracted from the posts
# count(word): counts how many times each word appears in the dataset
# sort = TRUE: sorts the result in descending order of frequency (most frequent words at the top)
coffee.freq <- tidy.coffee |>
  count(word, sort = TRUE)

# --- 6. Wordcloud ---
# Create a wordcloud where word size represents frequency
wordcloud(words = coffee.freq$word, freq = coffee.freq$n,
          max.words = 100, colors = brewer.pal(8, "Dark2"))

# --- 7. Custom Stopwords ---
# Define additional words to remove (specific to the context)
custom_stopwords <- tibble(word = c("http", "https", "rt", "t.co", "ed", "amp",
                                    "coffee", "morning", "barista", "caffeine",
                                    "espresso", "coldbrew"))
# Alternative list that keeps most of the domain words:
# custom_stopwords <- tibble(word = c("http", "https", "rt", "t.co", "ed", "amp", "coldbrew"))

# --- 8. Further Cleaning ---
# Remove custom stopwords and unwanted patterns (words starting with "00" or a single digit)
# anti_join(custom_stopwords, by = "word"): removes all words listed in the custom_stopwords table
# filter(!str_detect(word, "^00")): deletes all words starting with "00"
# filter(!str_detect(word, "^[0-9]$")): deletes all words that are a single digit from 0 to 9
tidy.coffee.2 <- tidy.coffee |>
  anti_join(custom_stopwords, by = "word") |>
  filter(!str_detect(word, "^00")) |>
  filter(!str_detect(word, "^[0-9]$"))

# --- 9. New Frequency Table ---
# Recalculate word frequencies after cleaning
coffee.freq.2 <- tidy.coffee.2 |>
  count(word, sort = TRUE)

# --- 10. New Wordcloud ---
# Create another wordcloud after applying custom cleaning
wordcloud(words = coffee.freq.2$word, freq = coffee.freq.2$n,
          max.words = 100, colors = brewer.pal(8, "Paired"),
          random.order = FALSE, scale = c(4, 0.7))

# --- 11. Explore Colors ---
# View the first 50 color names available in R
head(colors(), 50)

# --- 12. Explore Color Palettes ---
# Display an example palette from RColorBrewer
display.brewer.pal(8, "Dark2")
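# Optional: to browse beyond this single example palette, RColorBrewer ships a
# lookup table (brewer.pal.info) listing every palette with its size and type,
# and display.brewer.all() draws them all at once. A quick sketch:
rownames(brewer.pal.info)  # names of all available palettes
display.brewer.all()       # plot every palette for visual comparison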
# --- 13. Frequency Plot ---
# Create a bar plot of the most frequent words
coffee.freq.2 |>
  filter(n > 10) |>                   # Only words appearing more than 10 times
  mutate(word = reorder(word, n)) |>  # Reorder for better plotting
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Most Frequent Words in Coffee Posts",
       x = NULL, y = "Word Count")

# --- 14. Word Co-appearance ---
# Calculate word co-occurrence within the same post
# pairwise_count() is a function of the widyr package.
# It counts how many times two words appear together (co-occur) in the same document (n.doc).
# sort = TRUE sorts the result by decreasing co-occurrence (most frequent pairs first)
word_pairs <- tidy.coffee.2 |>
  pairwise_count(word, n.doc, sort = TRUE)

# View top co-occurring words
head(word_pairs)

# --- 15. Words co-appearing with "latte" ---
# Filter to see which words are most often paired with "latte"
# word_pairs: the dataset containing all word pairs that co-appear in posts
# filter(item1 == "latte"): selects only the rows where the first word of the pair (item1) is "latte".
# In practice: we search for all words associated with the term "latte" in the documents.
# arrange(desc(n)): sorts the results from highest to lowest co-occurrence (n is the number of co-occurrences)
latte_pairs <- word_pairs |>
  filter(item1 == "latte") |>
  arrange(desc(n))

# View the result
latte_pairs

# --- 16. Phi Coefficient for "latte" ---
# Calculate word correlations based on binary co-occurrence.
# pairwise_cor() computes the phi coefficient: a correlation for binary
# presence/absence, positive when two words appear in the same documents
# more often than expected by chance.
word_cor <- tidy.coffee.2 |>
  pairwise_cor(word, n.doc)

# Focus on words correlated with "latte"
latte_cor <- word_cor |>
  filter(item1 == "latte") |>
  arrange(desc(correlation))

# View correlated words
latte_cor

# --- 17. Word Correlation Plot ---
# Plot correlation of words with "latte"
latte_cor |>
  slice_max(correlation, n = 10) |>
  ggplot(aes(x = correlation, y = reorder(item2, correlation))) +
  geom_point(size = 4, color = "steelblue") +
  labs(title = "Top Correlated Words with 'latte'",
       x = "Correlation", y = NULL)

# Alternative: store the plot in an object, then print it explicitly
plot_latte_correlation <- latte_cor |>
  slice_max(correlation, n = 10) |>
  ggplot(aes(x = correlation, y = reorder(item2, correlation))) +
  geom_point(size = 4, color = "steelblue") +
  labs(title = "Top Correlated Words with 'latte'",
       x = "Correlation", y = NULL)

print(plot_latte_correlation)

# --- 18. Word Network ---
# Create a word network graph from co-occurrence data
# word_pairs: the table with the word pairs (item1, item2) and how many times they appear together (n)
# filter(n >= 5): selects only word pairs that appear together at least 5 times.
# Why? To prevent the network from being too cluttered and full of weak links.
network_data <- word_pairs |>
  filter(n >= 5)

# graph_from_data_frame(network_data): uses the igraph package to construct a graph (a network) from the network_data table.
# Nodes: the words (item1, item2).
# Edges: the connections between words that appear together.
# Edge weight: carried by the co-occurrence count n (how many times the words appeared together).
word_network <- graph_from_data_frame(network_data)
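# Optional sanity check before plotting (a sketch using standard igraph
# accessors): vcount()/ecount() report how many nodes and edges survived the
# n >= 5 filter, and E()$n shows the co-occurrence counts stored on the edges.
vcount(word_network)     # number of nodes (distinct words)
ecount(word_network)     # number of edges (retained word pairs)
head(E(word_network)$n)  # co-occurrence counts attached to the first edges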
# Plot the network
# ggraph(word_network, layout = "fr"): starts a network plot from the
# word_network object created above by graph_from_data_frame().
# layout = "fr" uses the Fruchterman-Reingold algorithm, which arranges the
# nodes (words) so that connected nodes attract and unconnected nodes repel
# each other. Result: an organic, readable network.
# geom_edge_link(): draws lines (edges) between co-appearing words;
# each line represents a relationship between two words.
# geom_node_point(): draws the points (nodes), one for each word.
# geom_node_text(aes(label = name), vjust = 1, hjust = 1): adds labels (the
# words) next to the nodes; vjust and hjust nudge the text so it does not
# overlap the point.
graph_plot <- ggraph(word_network, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

graph_plot
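# The plot above draws every edge with equal weight. A common refinement
# (a sketch, not part of the original analysis) is to map the co-occurrence
# count n onto edge transparency so stronger word pairs stand out, and to use
# repel = TRUE so labels avoid overlapping:
set.seed(2024)  # "fr" layouts are random; an arbitrary fixed seed keeps them stable
ggraph(word_network, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "steelblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()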