# Text Mining Lab1: String Manipulation in R
# Duration: 90 minutes | Tool: RStudio | Required packages: tidyverse

# ----------------------------------------------
# PART 1 – Basic Operations with Strings
# ----------------------------------------------

# stringr is attached explicitly for Part 1; tidyverse (which also attaches
# stringr, tibble, dplyr and tidyr) is needed from Exercise 4 onwards.
library(stringr)
library(tidyverse)

# Exercise 1: Creating and checking types
Lab1 <- c(1, 2, 3)
typeof(Lab1)
class(Lab1)
Lab1_char <- as.character(Lab1) # Convert the numeric vector to character
class(Lab1_char)

# Exercise 2: Working with quotes
string1 <- "This is a string with double quotes"
string2 <- 'This is a string with single quotes'
# Two equivalent ways to put double quotes inside a string:
# wrap the string in single quotes, or escape the inner quotes with \".
string3 <- 'She said, "Hello!"'
string3 <- "She said, \"Hello!\""
writeLines(string3) # Base R: prints the raw contents of the string
str_view(string3) # stringr: shows the underlying representation of the string

# Exercise 3: Length and subsetting
x <- c("Home", "Trip", "Dream") # Create a character vector
str_length(x) # Number of characters in each string
str_sub(x, start = 1, end = 3) # Extract the first 3 letters of each string
str_sub(x, start = -3, end = -1) # Extract the last 3 letters (negative indices)

# ----------------------------------------------
# PART 2 – Combining and transforming strings
# ----------------------------------------------

# Exercise 4: Personalized questions
# Create a tibble with 3 names.
# NOTE: the variable `names` shares its name with base::names(); calls such as
# names(df) still work because R skips non-function objects when looking up
# a function.
names <- tibble(name = c("Maria", "Luca", "Anna"))
# Add a new column holding one greeting per name
names <- names |>
  mutate(greeting = str_c("Hello ", name, " how are you?"))
names

# Exercise 5: Case conversion
sentence <- "this is a title"
str_to_upper(sentence)
str_to_lower(sentence)
str_to_title(sentence)

# The same conversions are applied to a string with accented characters
accents <- "Vieni quà e non andare là!"
# Exercise 5 (continued): case conversion on the accented string `accents`
str_to_upper(accents)
str_to_lower(accents)
str_to_title(accents)

# ----------------------------------------------
# PART 3 – Separation and reorganization
# ----------------------------------------------

# Exercise 6: Separate into multiple rows
df1 <- tibble(x = c("apple,banana,cream", "dream,emotions", "flowers"))
df1
# Split each value on "," and put every piece on its own row
df1 |>
  separate_longer_delim(x, delim = ",")

# Exercise 7: Separate into multiple columns
df2 <- tibble(x = c("A10.8.2025", "B10.10.2010", "C15.12.2015"))
df2
# Split each value on "." into three columns: code, month, year
df2 |>
  separate_wider_delim(x, delim = ".", names = c("code", "month", "year"))

# ----------------------------------------------
# PART 4 – Final Challenge
# ----------------------------------------------

# Exercise 8: Creative string manipulation
fruits <- c("apple", "banana", "pear", "persimmon", "kiwi", "mango", "orange")
str_length(fruits)
# Index of the fruit with the longest name (first one in case of ties)
which.max(str_length(fruits))

lengths <- str_length(fruits)
max_length <- max(lengths)
max_length
# Use that index to pull out the fruit itself
longest_fruit <- fruits[which.max(lengths)]
longest_fruit

# Same result with the pipe operator; the anonymous function is needed
# because the piped value is an index into `fruits`, not a first argument.
fruits |>
  str_length() |>
  which.max() |>
  (\(i) fruits[i])()

# Create a single string with the fruits separated by commas
list_fruits <- str_c(fruits, collapse = ", ")
list_fruits

# Combine each fruit with its colour, e.g. "apple is red"
fcolor <- c("red", "yellow", "green", "orange", "brown", "yellow", "orange")
str_c(fruits, " is ", fcolor)
# Replacement form: str_sub(string, start, end) <- value
# Replace the fourth character of each word with "-" (modifies `fruits`)
str_sub(fruits, 4, 4) <- "-"
fruits

# ----------------------------------------------
# EXTRA – For fast finishers
# ----------------------------------------------

#' Join the elements of a vector into a readable list
#'
#' Turns c("apple", "banana", "cream") into "apple, banana, and cream".
#'
#' @param x A vector of strings to combine.
#' @return `x` unchanged when it has 0 or 1 elements; otherwise a single
#'   string: "a and b" for two elements, "a, b, and c" for three or more.
#' @examples
#' combine_nicely(c("apple", "banana", "cream"))
combine_nicely <- function(x) {
  n <- length(x)
  # With 0 or 1 elements there is nothing to join, so return the input as is
  if (n <= 1) {
    return(x)
  }
  # Exactly two elements: join them with " and ", no comma
  if (n == 2) {
    return(str_c(x[1], " and ", x[2]))
  }
  # More than two: x[-n] takes all but the last element, which are joined
  # with ", "; then ", and " plus the last element is appended
  str_c(str_c(x[-n], collapse = ", "), ", and ", x[n])
}

combine_nicely(c("apple", "banana", "cream"))

# With the pipe operator
c("apple", "banana", "cream") |>
  combine_nicely()

# Using a tibble: pull the column out as a vector first
tibble(word = c("apple", "banana", "cream")) |>
  pull(word) |>
  combine_nicely()

################################################################################
# Example usage:
combine_nicely(c("apple", "banana", "cream"))
combine_nicely(c("red", "yellow", "white"))

# Combine them into a longer sentence
combined <- str_c(
  combine_nicely(c("apple", "banana", "cream")),
  " and ",
  combine_nicely(c("red", "green", "white")),
  " or ",
  combine_nicely("pink")
)
combined
# "apple, banana, and cream and red, green, and white or pink"