# 1. Random variables and distributions ####

### 1. Gaussian

# Simulate a random sample (i.i.d.) from a Standard Normal distribution: X ~ N(mu = 0, sigma = 1)
z <- rnorm(n = 10, mean = 0, sd = 1)

# Histogram
hist(z)

# Descriptive statistics
mean(z)
sd(z)
var(z)

# Simulate larger random samples from a Standard Normal distribution: X ~ N(mu = 0, sigma = 1)
z1 <- rnorm(n = 100, mean = 0, sd = 1)
hist(z1)
c(mean(z1), sd(z1), var(z1))

z2 <- rnorm(n = 1000, mean = 0, sd = 1)
hist(z2)
c(mean(z2), sd(z2), var(z2))

z3 <- rnorm(n = 10000, mean = 0, sd = 1)
hist(z3)
c(mean(z3), sd(z3), var(z3))

# Simulate a large random sample from a Normal distribution: X ~ N(mu = 20, sigma = 1)
x1 <- rnorm(n = 100000, mean = 20, sd = 1)
hist(x1)

# Simulate a large random sample from a Normal distribution: X ~ N(mu = 20, sigma = 5)
x2 <- rnorm(n = 100000, mean = 20, sd = 5)
hist(x2)

# Simulate a large random sample from a Normal distribution: X ~ N(mu = 30, sigma = 2)
x3 <- rnorm(n = 100000, mean = 30, sd = 2)
hist(x3)

# Combine density plots
plot(density(x1), xlim = c(5, 35), col = "red", main = "Normal distributions")
lines(density(x2), col = "blue")
lines(density(x3), col = "orange")

### 2. Poisson

# Simulate a random sample (i.i.d.) from a Poisson X ~ Pois(lambda = 2)
x1 <- rpois(n = 10, lambda = 2)

# Histogram (bar plot of the observed counts)
plot(table(x1))

# Descriptive statistics
mean(x1)
sd(x1)
var(x1)

# Simulate a large random sample (i.i.d.) from a Poisson X ~ Pois(lambda = 2)
x2 <- rpois(n = 10000, lambda = 2)

# Histogram, with the Normal approximation N(2, 2) scaled to expected counts
plot(table(x2))
lines(x = seq(from = min(x2), to = max(x2), by = 0.01),
      y = dnorm(x = seq(from = min(x2), to = max(x2), by = 0.01),
                mean = 2, sd = sqrt(2)) * 10000,
      col = "red", type = "l")

# Descriptive statistics
mean(x2)
sd(x2)
var(x2)

# Simulate a large random sample (i.i.d.) from a Poisson X ~ Pois(lambda = 20)
x3 <- rpois(n = 10000, lambda = 20)

# Histogram, with the Normal approximation N(20, 20) scaled to expected counts
plot(table(x3))
lines(x = seq(from = min(x3), to = max(x3), by = 0.01),
      y = dnorm(x = seq(from = min(x3), to = max(x3), by = 0.01),
                mean = 20, sd = sqrt(20)) * 10000,
      col = "red", type = "l")

# Descriptive statistics
mean(x3)
sd(x3)
var(x3)

# Simulate a large random sample (i.i.d.) from a Poisson X ~ Pois(lambda = 100)
x4 <- rpois(n = 10000, lambda = 100)

# Histogram, with the Normal approximation N(100, 100) scaled to expected counts
plot(table(x4))
lines(x = seq(from = min(x4), to = max(x4), by = 0.01),
      y = dnorm(x = seq(from = min(x4), to = max(x4), by = 0.01),
                mean = 100, sd = sqrt(100)) * 10000,
      col = "red", type = "l")

# Descriptive statistics
mean(x4)
sd(x4)
var(x4)

### 3. Bernoulli

# Simulate a random sample (i.i.d.) from a Bernoulli X ~ Bern(theta = 0.2)
x1 <- rbinom(n = 10, size = 1, prob = 0.2)

# Histogram
plot(table(x1))

# Descriptive statistics
mean(x1)
sd(x1)
var(x1)

# Simulate a random sample (i.i.d.) from a Bernoulli X ~ Bern(theta = 0.8)
x2 <- rbinom(n = 10, size = 1, prob = 0.8)

# Histogram
plot(table(x2))

# Descriptive statistics
mean(x2)
sd(x2)
var(x2)

# Simulate a large random sample (i.i.d.) from a Bernoulli X ~ Bern(theta = 0.8)
x3 <- rbinom(n = 10000, size = 1, prob = 0.8)

# Histogram
plot(table(x3))

# Descriptive statistics
mean(x3)
sd(x3)
var(x3)

# Simulate a random sample (i.i.d.) from a Binomial X ~ Bin(size = 10, theta = 0.80)
x4 <- rbinom(n = 10, size = 10, prob = 0.8)

# Histogram
plot(table(x4))

# Descriptive statistics
mean(x4)
sd(x4)
var(x4)
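# Side check (not part of the original exercise): compare the empirical moments of x4
# with the theoretical Binomial moments, E[X] = size * theta and Var(X) = size * theta * (1 - theta).
# With only n = 10 draws the agreement is rough; it improves with the larger samples below.
c(empirical_mean = mean(x4), theoretical_mean = 10 * 0.8)
c(empirical_var  = var(x4),  theoretical_var  = 10 * 0.8 * 0.2)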
# Simulate a large random sample (i.i.d.) from a Binomial X ~ Bin(size = 10, theta = 0.80)
x5 <- rbinom(n = 10000, size = 10, prob = 0.8)

# Histogram
plot(table(x5))

# Descriptive statistics
mean(x5)
sd(x5)
var(x5)

# Simulate a large random sample (i.i.d.) from a Binomial X ~ Bin(size = 400, theta = 0.80)
x6 <- rbinom(n = 10000, size = 400, prob = 0.8)

# Histogram, with the Normal approximation N(400*0.80, 400*0.80*0.20) scaled to expected counts
plot(table(x6))
lines(x = seq(from = min(x6), to = max(x6), by = 0.01),
      y = dnorm(x = seq(from = min(x6), to = max(x6), by = 0.01),
                mean = 400 * 0.80, sd = sqrt(400 * 0.80 * 0.20)) * 10000,
      col = "red", type = "l")

# Descriptive statistics
mean(x6)
sd(x6)
var(x6)

# Simulate a large random sample (i.i.d.) from a Binomial X ~ Bin(size = 4000, theta = 0.80)
x7 <- rbinom(n = 10000, size = 4000, prob = 0.8)

# Histogram, with the Normal approximation N(4000*0.80, 4000*0.80*0.20) scaled to expected counts
plot(table(x7))
lines(x = seq(from = min(x7), to = max(x7), by = 0.01),
      y = dnorm(x = seq(from = min(x7), to = max(x7), by = 0.01),
                mean = 4000 * 0.80, sd = sqrt(4000 * 0.80 * 0.20)) * 10000,
      col = "red", type = "l")

# Descriptive statistics
mean(x7)
sd(x7)
var(x7)

# 2. Estimators and their properties #####

### 1. Sample mean with small sample size, Normal case

# a. Assume our random sample comes from a Normal distribution X ~ N(mu = 5, sigma = 3)

# b. Let's start with a small sample
n <- 10
set.seed(1)
x <- rnorm(n = n, mean = 5, sd = 3)

# c. Compute the sample mean
mean(x)
sum(x) / n

# d. Compute the sampling error
mean(x) - 5

# e. Estimators are random variables, so the estimates change as the sample changes
mean(rnorm(n = n, mean = 5, sd = 3))
mean(rnorm(n = n, mean = 5, sd = 3))
mean(rnorm(n = n, mean = 5, sd = 3))

# f. Simulate 10 random samples and store their sample means
reps <- 10
sample_mean_10 <- numeric(length = reps)
sampling_error_10 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_mean_10[run] <- mean(y)
  sampling_error_10[run] <- mean(y) - 5
}
sample_mean_10
summary(sample_mean_10)
summary(sampling_error_10)

# g. What happens when the number of replications increases? Let's try 1000 replications
reps <- 1000
sample_mean_1000 <- numeric(length = reps)
sampling_error_1000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_mean_1000[run] <- mean(y)
  sampling_error_1000[run] <- mean(y) - 5
}
summary(sample_mean_1000)
summary(sampling_error_1000)

# h. Let's try 100000 replications
reps <- 100000
sample_mean_100000 <- numeric(length = reps)
sampling_error_100000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_mean_100000[run] <- mean(y)
  sampling_error_100000[run] <- mean(y) - 5
}
summary(sample_mean_100000)
summary(sampling_error_100000)

# i. Observe the empirical distributions of the sample mean
plot(density(sample_mean_10), xlim = c(0, 10), col = "red",
     main = "Sample mean distribution (Gaussian case)")
lines(density(sample_mean_1000), col = "blue")
lines(density(sample_mean_100000), col = "orange")

# l. Observe the empirical distributions of the sampling errors
plot(density(sampling_error_10), xlim = c(-5, 5), col = "red",
     main = "Sampling error distribution (Gaussian case)")
lines(density(sampling_error_1000), col = "blue")
lines(density(sampling_error_100000), col = "orange")

### 2. Sample mean with large sample size, Normal case

# a. Assume our random sample comes from a Normal distribution X ~ N(mu = 5, sigma = 3)

# b. Let's start with a large sample
n <- 1000
set.seed(1)
x <- rnorm(n = n, mean = 5, sd = 3)

# c. Compute the sample mean
mean(x)
sum(x) / n

# d. Compute the sampling error
mean(x) - 5
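# Side note (not part of the original script): the theoretical standard error of the
# sample mean is sigma / sqrt(n). With sigma = 3 and n = 1000 it is roughly 0.095,
# so a single estimate should rarely miss mu = 5 by more than a few tenths.
3 / sqrt(n)        # theoretical standard error of the sample mean
sd(x) / sqrt(n)    # plug-in estimate from the simulated sample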
# e. Estimators are random variables, so the estimates change as the sample changes
mean(rnorm(n = n, mean = 5, sd = 3))
mean(rnorm(n = n, mean = 5, sd = 3))
mean(rnorm(n = n, mean = 5, sd = 3))

# f. Simulate 10 random samples and store their sample means
reps <- 10
sample_mean_10 <- numeric(length = reps)
sampling_error_10 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_mean_10[run] <- mean(y)
  sampling_error_10[run] <- mean(y) - 5
}
sample_mean_10
summary(sample_mean_10)
summary(sampling_error_10)

# g. What happens when the number of replications increases? Let's try 1000 replications
reps <- 1000
sample_mean_1000 <- numeric(length = reps)
sampling_error_1000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_mean_1000[run] <- mean(y)
  sampling_error_1000[run] <- mean(y) - 5
}
summary(sample_mean_1000)
summary(sampling_error_1000)

# h. Let's try 100000 replications
reps <- 100000
sample_mean_100000 <- numeric(length = reps)
sampling_error_100000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_mean_100000[run] <- mean(y)
  sampling_error_100000[run] <- mean(y) - 5
}
summary(sample_mean_100000)
summary(sampling_error_100000)

# i. Observe the empirical distributions of the sample mean
plot(density(sample_mean_10), xlim = c(0, 10), col = "red",
     main = "Sample mean distribution (Gaussian case)")  # fixed x-axis scale
lines(density(sample_mean_1000), col = "blue")
lines(density(sample_mean_100000), col = "orange")

# l. Observe the empirical distributions of the sampling errors
plot(density(sampling_error_10), col = "red",
     main = "Sampling error distribution (Gaussian case)")
lines(density(sampling_error_1000), col = "blue")
lines(density(sampling_error_100000), col = "orange")

### 3. Sample mean with large sample size, non-Normal case

# a. Assume our random sample comes from a Poisson distribution X ~ Pois(lambda = 3)

# b. Let's start with a large sample and a small lambda
n <- 1000
set.seed(1)
x <- rpois(n = n, lambda = 3)

# c. Compute the sample mean
mean(x)
sum(x) / n

# d. Compute the sampling error
mean(x) - 3

# e. Estimators are random variables, so the estimates change as the sample changes
mean(rpois(n = n, lambda = 3))
mean(rpois(n = n, lambda = 3))
mean(rpois(n = n, lambda = 3))

# f. Simulate 10 random samples and store their sample means
reps <- 10
sample_mean_10 <- numeric(length = reps)
sampling_error_10 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rpois(n = n, lambda = 3)
  sample_mean_10[run] <- mean(y)
  sampling_error_10[run] <- mean(y) - 3
}
sample_mean_10
summary(sample_mean_10)
summary(sampling_error_10)

# g. What happens when the number of replications increases? Let's try 1000 replications
reps <- 1000
sample_mean_1000 <- numeric(length = reps)
sampling_error_1000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rpois(n = n, lambda = 3)
  sample_mean_1000[run] <- mean(y)
  sampling_error_1000[run] <- mean(y) - 3
}
summary(sample_mean_1000)
summary(sampling_error_1000)
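# A more compact, equivalent formulation of the Monte Carlo loops above (a sketch
# using replicate(); it seeds once rather than once per replication, so the numbers
# will differ slightly from the loop results).
set.seed(1)
sample_mean_alt <- replicate(reps, mean(rpois(n = n, lambda = 3)))
summary(sample_mean_alt)
summary(sample_mean_alt - 3)   # corresponding sampling errors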
# h. Let's try 100000 replications
reps <- 100000
sample_mean_100000 <- numeric(length = reps)
sampling_error_100000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rpois(n = n, lambda = 3)
  sample_mean_100000[run] <- mean(y)
  sampling_error_100000[run] <- mean(y) - 3
}
summary(sample_mean_100000)
summary(sampling_error_100000)

# i. Observe the empirical distributions of the sample mean
plot(density(sample_mean_10), xlim = c(2.5, 3.5), col = "red",
     main = "Sample mean distribution (Poisson case)")
lines(density(sample_mean_1000), col = "blue")
lines(density(sample_mean_100000), col = "orange")

# l. Observe the empirical distributions of the sampling errors
plot(density(sampling_error_10), col = "red",
     main = "Sampling error distribution (Poisson case)")
lines(density(sampling_error_1000), col = "blue")
lines(density(sampling_error_100000), col = "orange")

### 4. Sample mean with large sample size, non-Normal case

# a. Assume our random sample comes from a Poisson distribution X ~ Pois(lambda = 40)

# b. Let's start with a large sample and a large lambda
n <- 1000
set.seed(1)
x <- rpois(n = n, lambda = 40)

# c. Compute the sample mean
mean(x)
sum(x) / n

# d. Compute the sampling error
mean(x) - 40

# e. Estimators are random variables, so the estimates change as the sample changes
mean(rpois(n = n, lambda = 40))
mean(rpois(n = n, lambda = 40))
mean(rpois(n = n, lambda = 40))

# f. Simulate 10 random samples and store their sample means
reps <- 10
sample_mean_10 <- numeric(length = reps)
sampling_error_10 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rpois(n = n, lambda = 40)
  sample_mean_10[run] <- mean(y)
  sampling_error_10[run] <- mean(y) - 40
}
sample_mean_10
summary(sample_mean_10)
summary(sampling_error_10)

# g. What happens when the number of replications increases? Let's try 1000 replications
reps <- 1000
sample_mean_1000 <- numeric(length = reps)
sampling_error_1000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rpois(n = n, lambda = 40)
  sample_mean_1000[run] <- mean(y)
  sampling_error_1000[run] <- mean(y) - 40
}
summary(sample_mean_1000)
summary(sampling_error_1000)

# h. Let's try 100000 replications
reps <- 100000
sample_mean_100000 <- numeric(length = reps)
sampling_error_100000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rpois(n = n, lambda = 40)
  sample_mean_100000[run] <- mean(y)
  sampling_error_100000[run] <- mean(y) - 40
}
summary(sample_mean_100000)
summary(sampling_error_100000)

# i. Observe the empirical distributions of the sample mean
plot(density(sample_mean_10), xlim = c(38, 42), ylim = c(0, 2), col = "red",
     main = "Sample mean distribution (Poisson case)")
lines(density(sample_mean_1000), col = "blue")
lines(density(sample_mean_100000), col = "orange")

# l. Observe the empirical distributions of the sampling errors
plot(density(sampling_error_10), col = "red", ylim = c(0, 2),
     main = "Sampling error distribution (Poisson case)")
lines(density(sampling_error_1000), col = "blue")
lines(density(sampling_error_100000), col = "orange")

### 5. Sample variance with small sample size, Normal case

# a. Assume our random sample comes from a Normal distribution X ~ N(mu = 5, sigma = 3)

# b. Let's start with a small sample size
n <- 10
set.seed(1)
x <- rnorm(n = n, mean = 5, sd = 3)

# c. Compute the sample variance
var(x)

# d. Compute the sampling error (the true variance is sigma^2 = 9)
var(x) - 9
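# Side note (not part of the original script): var() uses the unbiased estimator with
# denominator n - 1. The maximum-likelihood version divides by n and is biased
# downward, which matters for a small sample like n = 10.
var(x)                   # unbiased sample variance
var(x) * (n - 1) / n     # biased (MLE) version; its expected value is sigma^2 * (n - 1) / n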
# e. Estimators are random variables, so the estimates change as the sample changes
var(rnorm(n = n, mean = 5, sd = 3))
var(rnorm(n = n, mean = 5, sd = 3))
var(rnorm(n = n, mean = 5, sd = 3))

# f. Simulate 10 random samples and store their sample variances
reps <- 10
sample_var_10 <- numeric(length = reps)
sampling_error_10 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_var_10[run] <- var(y)
  sampling_error_10[run] <- var(y) - 9
}
sample_var_10
summary(sample_var_10)
summary(sampling_error_10)

# g. What happens when the number of replications increases? Let's try 1000 replications
reps <- 1000
sample_var_1000 <- numeric(length = reps)
sampling_error_1000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_var_1000[run] <- var(y)
  sampling_error_1000[run] <- var(y) - 9
}
summary(sample_var_1000)
summary(sampling_error_1000)

# h. Let's try 100000 replications
reps <- 100000
sample_var_100000 <- numeric(length = reps)
sampling_error_100000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_var_100000[run] <- var(y)
  sampling_error_100000[run] <- var(y) - 9
}
summary(sample_var_100000)
summary(sampling_error_100000)

# i. Observe the empirical distributions of the sample variance
plot(density(sample_var_10), xlim = c(0, 20), col = "red",
     main = "Sample variance distribution (Gaussian case)")
lines(density(sample_var_1000), col = "blue")
lines(density(sample_var_100000), col = "orange")

# l. Observe the empirical distributions of the sampling errors
plot(density(sampling_error_10), col = "red",
     main = "Sampling error distribution (Gaussian case)")
lines(density(sampling_error_1000), col = "blue")
lines(density(sampling_error_100000), col = "orange")

### 6. Sample variance with large sample size, Normal case

# a. Assume our random sample comes from a Normal distribution X ~ N(mu = 5, sigma = 3)

# b. Let's start with a large sample size
n <- 1000
set.seed(1)
x <- rnorm(n = n, mean = 5, sd = 3)

# c. Compute the sample variance
var(x)

# d. Compute the sampling error (the true variance is sigma^2 = 9)
var(x) - 9

# e. Estimators are random variables, so the estimates change as the sample changes
var(rnorm(n = n, mean = 5, sd = 3))
var(rnorm(n = n, mean = 5, sd = 3))
var(rnorm(n = n, mean = 5, sd = 3))

# f. Simulate 10 random samples and store their sample variances
reps <- 10
sample_var_10 <- numeric(length = reps)
sampling_error_10 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_var_10[run] <- var(y)
  sampling_error_10[run] <- var(y) - 9
}
sample_var_10
summary(sample_var_10)
summary(sampling_error_10)

# g. What happens when the number of replications increases? Let's try 1000 replications
reps <- 1000
sample_var_1000 <- numeric(length = reps)
sampling_error_1000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_var_1000[run] <- var(y)
  sampling_error_1000[run] <- var(y) - 9
}
summary(sample_var_1000)
summary(sampling_error_1000)

# h. Let's try 100000 replications
reps <- 100000
sample_var_100000 <- numeric(length = reps)
sampling_error_100000 <- numeric(length = reps)
for (run in 1:reps) {
  set.seed(run)
  y <- rnorm(n = n, mean = 5, sd = 3)
  sample_var_100000[run] <- var(y)
  sampling_error_100000[run] <- var(y) - 9
}
summary(sample_var_100000)
summary(sampling_error_100000)
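# Quick check (not part of the original script): for Gaussian data the variance of the
# sample variance is Var(S^2) = 2 * sigma^4 / (n - 1). With sigma = 3 and n = 1000 the
# Monte Carlo spread of the simulated variances should be close to this value.
sqrt(2 * 3^4 / (n - 1))   # theoretical standard deviation of S^2 (about 0.40)
sd(sample_var_100000)     # Monte Carlo estimate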
# i. Observe the empirical distributions of the sample variance
plot(density(sample_var_10), xlim = c(5, 13), ylim = c(0, 1), col = "red",
     main = "Sample variance distribution (Gaussian case)")
lines(density(sample_var_1000), col = "blue")
lines(density(sample_var_100000), col = "orange")

# l. Observe the empirical distributions of the sampling errors
plot(density(sampling_error_10), col = "red", ylim = c(0, 1),
     main = "Sampling error distribution (Gaussian case)")
lines(density(sampling_error_1000), col = "blue")
lines(density(sampling_error_100000), col = "orange")

# 3. Data input ####

## 3.1. read.table(), write.csv(), save(), load(), read.csv() #####

# Create a fake matrix
finta_matrice <- matrix(1:1000, nrow = 100, ncol = 100)

# Display the fake matrix
print("Fake matrix:")
print(finta_matrice)

# Create a fake data frame
finto_dataframe <- data.frame(
  A = 1:1000,
  B = sample(c("a", "b", "c", "d", "e"), 1000, replace = TRUE),
  stringsAsFactors = FALSE
)

# Display the fake data frame
print("Fake data frame:")
print(finto_dataframe)

# Export as an .Rdata file
save(finta_matrice, file = "data/matrice_finta.Rdata")

# Export as a text file with write.table (without row and column names)
write.table(finta_matrice, file = "data/matrice_finta.csv", sep = ";",
            row.names = FALSE, col.names = FALSE)

# Save both objects in a single .Rdata file
save(finta_matrice, finto_dataframe, file = "data/dati_finti.Rdata")

# Export as a CSV file
write.csv(finto_dataframe, file = "data/finto_dataframe.csv", row.names = FALSE)

rm(finta_matrice, finto_dataframe)

# Re-import the .Rdata file
load("data/dati_finti.Rdata")

# Display the re-imported matrix and data frame
print("Re-imported matrix:")
print(finta_matrice)
print("Re-imported data frame:")
print(finto_dataframe)

rm(finta_matrice, finto_dataframe)

# Re-import the CSV file
finto_dataframe_csv <- read.csv("data/finto_dataframe.csv")

# Display the re-imported data frame
print("Data frame re-imported from CSV:")
print(finto_dataframe_csv)

# Re-import the text file with read.table
matrice_finta_csv <- read.table("data/matrice_finta.csv", header = FALSE, sep = ";")

# Convert the data frame into a matrix
matrice_da_df <- as.matrix(matrice_finta_csv)

# Display the matrix re-imported from the text file
print("Matrix re-imported from text file:")
print(matrice_da_df)

## 3.2. scan() ####

# Ask the user to enter three numbers separated by spaces
numeri <- scan(what = numeric(), n = 3)

# Write a small numeric table to disk and read it back with scan()
dati <- matrix(1:10, nrow = 2, byrow = TRUE)
write.table(dati, file = "data/dati.txt", sep = " ", row.names = FALSE, col.names = FALSE)
numeri_da_file <- scan("data/dati.txt", what = numeric())

# Example vector of words
parole <- c("gatto", "cane", "pesce", "uccello")
writeLines(parole, "data/parole.txt")
parole <- scan("data/parole.txt", what = character())
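# A small extra example (not part of the original script): scan() can also read
# delimited text directly. Here it re-reads the semicolon-separated matrix written in
# section 3.1; the 100 x 100 matrix holds 10000 cells because 1:1000 was recycled.
valori <- scan("data/matrice_finta.csv", what = numeric(), sep = ";")
length(valori)   # 10000 values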
source("Statistica_4alezione_functions.R")  # import the helper functions
edit(hist_exp)                              # interactively edit the function to add scan() input

# 4. Automatic graphical representation ####

hist_exp()    # modified version
gg_visual()   # all distributions

# 5. Probability distributions from real data ####

library(ggplot2)   # needed for the ggplot() calls below
library(zoo)

load("data/weather_caraj.Rdata")
weather_data <- weather_data[weather_data$time < as.Date("2024-01-01"), ]

# Boxplots of temperature by year
ggplot(weather_data) +
  geom_boxplot(aes(y = temperature, x = as.factor(year)))

# Moving averages of the daily temperature
n <- 365              # number of observations
finestra1 <- 30       # window: 1 month
finestra2 <- 180      # window: 6 months
finestra3 <- 365      # window: 1 year
finestra4 <- 365 * 5  # window: 5 years

weather_data$MediaMobile  <- zoo::rollmean(weather_data$temperature, k = finestra1, align = "center", fill = NA)
weather_data$MediaMobile2 <- zoo::rollmean(weather_data$temperature, k = finestra2, align = "center", fill = NA)
weather_data$MediaMobile3 <- zoo::rollmean(weather_data$temperature, k = finestra3, align = "center", fill = NA)
weather_data$MediaMobile4 <- zoo::rollmean(weather_data$temperature, k = finestra4, align = "center", fill = NA)

# Build the ggplot
ggplot(data = weather_data, aes(x = time)) +
  geom_line(aes(y = temperature, color = "Original values"), size = .1) +
  geom_line(aes(y = MediaMobile, color = "MA 1 month"), size = .2) +
  geom_line(aes(y = MediaMobile2, color = "MA 6 months"), size = .3) +
  geom_line(aes(y = MediaMobile3, color = "MA 1 year"), size = .5) +
  geom_line(aes(y = MediaMobile4, color = "MA 5 years"), size = 1) +
  scale_color_manual(
    values = c(
      "Original values" = "lightblue",
      "MA 1 month" = "red",
      "MA 6 months" = "green",
      "MA 1 year" = "yellow",
      "MA 5 years" = "purple"
    ),
    name = "Legend",
    breaks = c("Original values", "MA 1 month", "MA 6 months", "MA 1 year", "MA 5 years")
  ) +
  labs(x = "days", y = "degrees Celsius") +
  # coord_cartesian(ylim = c(11, 16)) +  # <--- focus on the moving averages
  theme_light()
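# Optional follow-up (a sketch, not part of the original script): a numeric companion
# to the per-year boxplots above, assuming the same 'year' column of weather_data.
aggregate(temperature ~ year, data = weather_data, FUN = mean)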