# Text Mining on Coffee Posts

# --- 0. Libraries ---
# Load all the necessary libraries
library(tidyverse)    # Data manipulation and visualization
library(tidytext)     # Text mining following tidy data principles
library(wordcloud)    # To create word clouds
library(widyr)        # To calculate word pair correlations and co-occurrences
library(igraph)       # To create network graphs
library(ggraph)       # To visualize network graphs
library(RColorBrewer) # To use color palettes for plots

# --- 1. Import Dataset ---
# Read the CSV file into a tibble
coffee.posts <- read_csv("C:/Users/myfti/Downloads/coffee_posts_final.csv")

# --- 2. Inspect the dataset ---
# Look at the structure of the data to understand its columns and types
glimpse(coffee.posts)
# Alternatives - preview the first rows, or open the data in a viewer:
head(coffee.posts)
view(coffee.posts)

# --- 3. Select variables n.doc and text ---
# Keep only the columns we need: the document id (n.doc) and the post text
coffee.posts <- coffee.posts |>
  select(n.doc, text)

# --- 4. Tokenization and Stopwords Removal ---
# Tokenize the text into words and remove common stopwords
# unnest_tokens() splits each text into single words
# anti_join(stop_words) removes uninformative common words like "the", "and"
tidy.coffee <- coffee.posts |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")

# --- 5. Frequency Table ---
# Count the number of occurrences of each word
# tidy.coffee: the dataset where each row contains a single word (token) extracted from the posts
# count(word): counts how many times each word appears in the dataset
# sort = TRUE: sorts the result in descending order of frequency (most frequent words at the top)
coffee.freq <- tidy.coffee |>
  count(word, sort = TRUE)

# --- 6. Wordcloud ---
# Create a wordcloud where word size represents frequency
wordcloud(words = coffee.freq$word, freq = coffee.freq$n,
          max.words = 100, colors = brewer.pal(8, "Dark2"))

# --- 7. Custom Stopwords ---
# Define additional words to remove (specific to the context)
custom_stopwords <- tibble(word = c("http", "https", "rt", "t.co", "ed", "amp",
                                    "coffee", "morning", "barista", "caffeine",
                                    "espresso", "coldbrew"))
# Alternative list that keeps most of the domain words:
# custom_stopwords <- tibble(word = c("http", "https", "rt", "t.co", "ed", "amp", "coldbrew"))

# --- 8. Further Cleaning ---
# Remove custom stopwords and unwanted patterns (words starting with "00" or a single digit)
# anti_join(custom_stopwords, by = "word"): removes all words listed in the custom_stopwords table
# filter(!str_detect(word, "^00")): deletes all words starting with "00"
# filter(!str_detect(word, "^[0-9]$")): deletes all words that are a single digit from 0 to 9
tidy.coffee.2 <- tidy.coffee |>
  anti_join(custom_stopwords, by = "word") |>
  filter(!str_detect(word, "^00")) |>
  filter(!str_detect(word, "^[0-9]$"))

# --- 9. New Frequency Table ---
# Recalculate word frequencies after cleaning
coffee.freq.2 <- tidy.coffee.2 |>
  count(word, sort = TRUE)

# --- 10. New Wordcloud ---
# Create another wordcloud after applying custom cleaning
wordcloud(words = coffee.freq.2$word, freq = coffee.freq.2$n,
          max.words = 100, colors = brewer.pal(8, "Paired"),
          random.order = FALSE, scale = c(4, 0.7))

# --- 11. Explore Colors ---
# View the first 50 color names available in R
head(colors(), 50)

# --- 12. Explore Color Palettes ---
# Display an example palette from RColorBrewer
display.brewer.pal(8, "Dark2")
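# Optional: to browse beyond this single example palette, RColorBrewer ships a
# lookup table (brewer.pal.info) listing every palette with its size and type,
# and display.brewer.all() draws them all at once. A quick sketch:
rownames(brewer.pal.info)  # names of all available palettes
display.brewer.all()       # plot every palette for visual comparison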
# --- 13. Frequency Plot ---
# Create a bar plot of the most frequent words
coffee.freq.2 |>
  filter(n > 10) |>                   # Only words appearing more than 10 times
  mutate(word = reorder(word, n)) |>  # Reorder for better plotting
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Most Frequent Words in Coffee Posts",
       x = NULL, y = "Word Count")

# --- 14. Word Co-appearance ---
# Calculate word co-occurrence within the same post
# pairwise_count() is a function of the widyr package.
# It counts how many times two words appear together (co-occur) in the same document (n.doc).
# sort = TRUE sorts the result by decreasing co-occurrence (most frequent pairs first)
word_pairs <- tidy.coffee.2 |>
  pairwise_count(word, n.doc, sort = TRUE)

# View top co-occurring words
head(word_pairs)

# --- 15. Words co-appearing with "latte" ---
# Filter to see which words are most often paired with "latte"
# word_pairs: the dataset containing all word pairs that co-appear in posts
# filter(item1 == "latte"): selects only the rows where the first word of the pair (item1) is "latte".
# In practice: we search for all words associated with the term "latte" in the documents.
# arrange(desc(n)): sorts the results from highest to lowest co-occurrence (n is the number of co-occurrences)
latte_pairs <- word_pairs |>
  filter(item1 == "latte") |>
  arrange(desc(n))

# View the result
latte_pairs

# --- 16. Phi Coefficient for "latte" ---
# Calculate word correlations based on binary co-occurrence.
# pairwise_cor() computes the phi coefficient: a correlation for binary
# presence/absence, positive when two words appear in the same documents
# more often than expected by chance.
word_cor <- tidy.coffee.2 |>
  pairwise_cor(word, n.doc)

# Focus on words correlated with "latte"
latte_cor <- word_cor |>
  filter(item1 == "latte") |>
  arrange(desc(correlation))

# View correlated words
latte_cor

# --- 17. Word Correlation Plot ---
# Plot correlation of words with "latte"
latte_cor |>
  slice_max(correlation, n = 10) |>
  ggplot(aes(x = correlation, y = reorder(item2, correlation))) +
  geom_point(size = 4, color = "steelblue") +
  labs(title = "Top Correlated Words with 'latte'",
       x = "Correlation", y = NULL)

# Alternative: store the plot in an object, then print it explicitly
plot_latte_correlation <- latte_cor |>
  slice_max(correlation, n = 10) |>
  ggplot(aes(x = correlation, y = reorder(item2, correlation))) +
  geom_point(size = 4, color = "steelblue") +
  labs(title = "Top Correlated Words with 'latte'",
       x = "Correlation", y = NULL)

print(plot_latte_correlation)

# --- 18. Word Network ---
# Create a word network graph from co-occurrence data
# word_pairs: the table with the word pairs (item1, item2) and how many times they appear together (n)
# filter(n >= 5): selects only word pairs that appear together at least 5 times.
# Why? To prevent the network from being too cluttered and full of weak links.
network_data <- word_pairs |>
  filter(n >= 5)

# graph_from_data_frame(network_data): uses the igraph package to construct a graph (a network) from the network_data table.
# Nodes: the words (item1, item2).
# Edges: the connections between words that appear together.
# Edge weight: carried by the co-occurrence count n (how many times the words appeared together).
word_network <- graph_from_data_frame(network_data)
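# Optional sanity check before plotting (a sketch using standard igraph
# accessors): vcount()/ecount() report how many nodes and edges survived the
# n >= 5 filter, and E()$n shows the co-occurrence counts stored on the edges.
vcount(word_network)     # number of nodes (distinct words)
ecount(word_network)     # number of edges (retained word pairs)
head(E(word_network)$n)  # co-occurrence counts attached to the first edges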
# Plot the network
# ggraph(word_network, layout = "fr"): starts a network plot from the
# word_network object created above by graph_from_data_frame().
# layout = "fr" uses the Fruchterman-Reingold algorithm, which arranges the
# nodes (words) so that connected nodes attract and unconnected nodes repel
# each other. Result: an organic, readable network.
# geom_edge_link(): draws lines (edges) between co-appearing words;
# each line represents a relationship between two words.
# geom_node_point(): draws the points (nodes), one for each word.
# geom_node_text(aes(label = name), vjust = 1, hjust = 1): adds labels (the
# words) next to the nodes; vjust and hjust nudge the text so it does not
# overlap the point.
graph_plot <- ggraph(word_network, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

graph_plot
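# The plot above draws every edge with equal weight. A common refinement
# (a sketch, not part of the original analysis) is to map the co-occurrence
# count n onto edge transparency so stronger word pairs stand out, and to use
# repel = TRUE so labels avoid overlapping:
set.seed(2024)  # "fr" layouts are random; an arbitrary fixed seed keeps them stable
ggraph(word_network, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "steelblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()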