# Import libraries ----
library(readr)
library(psych)
library(effsize)
library(apaTables)
library(moments)

# Import data ----
data <- read_csv("C:/Users/Saniya and Family/Downloads/files/PSY 350 Class Survey (2).csv")

# The three variables studied are: coffee consumption, whether or not
# psychology was the first major, and consistency.
# Consistency is the average of 10 items (columns 26-35), so we add a
# derived column con_avg holding each respondent's item mean.
con_data <- data[, 26:35]
data$con_avg <- rowMeans(con_data, na.rm = TRUE)

# Q1. Level of measurement:
# 1. Coffee consumption: ratio
# 2. Whether or not psychology was first major: ordinal
#    NOTE(review): a binary yes/no category is usually classified as
#    nominal, not ordinal — confirm against the course definitions.
# 3. Consistency average: interval

# Q2.
# a) Counts and proportions of first-major responses.
#    Computed from the data rather than hard-coded, so the numbers stay
#    correct if the CSV changes. Observed: 71 no, 101 yes.
major_counts <- table(data$firstmajor)
major_props <- prop.table(major_counts)
noprop <- unname(major_props["no"])   # proportion of no: 0.4127907
yesprop <- unname(major_props["yes"]) # proportion of yes: 0.5872093

# b) Descriptive statistics.
# Coffee consumption:
coffeemean <- mean(data$coffee, na.rm = TRUE)     # 5.970588
coffeesd <- sd(data$coffee, na.rm = TRUE)         # 6.926005
coffeskew <- skewness(data$coffee, na.rm = TRUE)  # 1.680131

# Consistency average:
con_mean <- mean(data$con_avg, na.rm = TRUE)      # 3.68469
con_sd <- sd(data$con_avg, na.rm = TRUE)          # 0.6388831
con_skew <- skewness(data$con_avg, na.rm = TRUE)  # -0.1082667

# Q3.
# a) Normal Q-Q plot of the consistency averages.
qqnorm(data$con_avg)
qqline(data$con_avg, col = "blue")
# b) Almost all points lie on the line, so the data can be considered
#    approximately normally distributed. The skewness computed above is
#    also close to 0, which supports normality.

# Q4.
# a) Cohen's d for coffee consumption by first-major status.
cohen1 <- cohen.d(as.numeric(data$coffee), as.factor(data$firstmajor),
                  alpha = 0.05, na.rm = TRUE)
d_coffee <- cohen1$estimate  # -0.182328
# b) This is a small effect.
# c) Because we only test whether students drink LESS than the average
#    (a directional hypothesis), a one-tailed test should be used.
# d) One-sample t-test against mu = 11.2.
test1 <- t.test(data$coffee, mu = 11.2)
ts_coffee <- test1$statistic   # test statistic = -9.8445
df_coffee <- test1$parameter   # df = 169
pval_coffee <- test1$p.value   # p-value < 2.2e-16

# Q5.
# a) Mean consistency by first-major status.
aggregate(data$con_avg, list(data$firstmajor), FUN = mean)
# no: 3.542254, yes: 3.784818 — students who had psychology as their
# first major report higher consistency.

# b) Cohen's d for consistency by first-major status.
cohen2 <- cohen.d(as.numeric(data$con_avg), as.factor(data$firstmajor),
                  alpha = 0.05, na.rm = TRUE)
d_con <- cohen2$estimate  # -0.3853915

# c) This is a medium effect.

# d) Because we ask whether the mean is higher OR lower (a
#    non-directional hypothesis), a two-tailed test should be used.

# e) Welch two-sample t-test comparing the two major groups.
test2 <- t.test(data$con_avg[data$firstmajor == "yes"],
                data$con_avg[data$firstmajor == "no"])
ts_con <- test2$statistic   # t = 2.4836
df_con <- test2$parameter   # df = 149.78
pval_con <- test2$p.value   # p-value = 0.01411

# f) Yes, there is a statistically significant difference between the
#    group means.