setwd('C:\\Users\\v\\Documents\\My Documents full backup\\ODU\\Modern topics seminar\\Day 03')
library(lme4) # load library
######################################
#GLM
#http://www.theanalysisfactor.com/glm-r-overdispersion-count-regression/
#Counts of students diagnosed as infected, number of days from initial outbreak
cases=read.csv("cases example.csv")
attach(cases)
head(cases)
#use defaul link function (natural log)
model1 <- glm(Students ~ Days, family=poisson, data=cases)
summary(model1)
#The negative coefficient for Days indicates that as days increase, the mean number of students
#with the disease is smaller. This coefficient is highly significant (p < 2e-16).
#If the residual deviance were greater than the degrees of freedom,
#there would be over-dispersion. This means that there is extra variance not accounted for
#by the model or by the error structure. This is a very important model assumption,
#so would have to re-fit the model using quasi poisson errors.
#allows overdispersion parameter to be estimated
model2 <- glm(Students ~ Days, family=quasipoisson, data=cases)
summary(model2)
#dispersion parameter is less than 1, so no overdispersion
#use model 1
summary(model1) #OR
model1$coefficients
#We can use the residual deviance to perform a goodness of fit test for the overall model.
#The residual deviance is the difference between the deviance of the current model and
#the maximum deviance of the ideal model where the predicted values are identical to the observed.
#Therefore, if the residual difference is small enough, the goodness of fit test will
#not be significant, indicating that the model fits the data. We conclude that the model
#fits reasonably well because the goodness-of-fit chi-squared test is not statistically significant.
#If the test had been statistically significant, it would indicate that the data do not fit the model well.
#In that situation, we may try to determine if there are omitted predictor variables,
#if our linearity assumption holds and/or if there is an issue of over-dispersion.
with(model1, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
#check model with diagnostic plots
plot(model1)
Y <- predict(model1) #in log scale
Y2 <- exp(Y) #exponentiate of the fitted values to back-transform log scale to original data.
write.csv(Y2, "Y2 exp.csv")
#or use type="response"
Z <- predict(model1, type="response")
plot(Days, Students, xlab = "DAYS", ylab = "STUDENTS", pch = 16)
lines(Y2, lwd = 3, col = "blue")
lines(Z, lwd = 3, col = "red")
#Let's calculate the impact on the number of cases arising from a one day increase along the time axis. First we take the exponential of the coefficients.
coeffs <- exp(coef(model1))
coeffs
#We calculate the 95% confidence interval (upper and lower confidence limits) as follows:
CI <- exp(confint.default(model1))
CI
#We can calculate the change in number of students presenting with the disease for each additional day, as follows:
1 - 0.9826884
#The reduction (rate ratio) is approximately 0.02 cases for each additional day.
detach(cases)
####################################
#######################################
####GLMER
#####simple repeated measures random effects model
#Frequently we are not as interested in comparing the
#particular subjects in the study as much as we are interested
#in modeling the variability in the population from which the
#subjects were chosen.
#the individual or block which is repeatedly measured is the random effect
#nuisance variable that must be included
library(npmlreg)
library(car)
data(irlsuicide)
head(irlsuicide)
attach(irlsuicide)
#Region : 13 different health regions of Ireland.
#ID : factor with levels 1 2 3 . 12 13 corresponding to Regions.
#pop : a numeric vector giving the population size of each suppopulation group.
#death :a numeric vector giving the total number of deaths of each suppopulation group.
#sex : a factor for gender with levels 0 (female) and 1 (male).
#age : a factor for age with levels 1 (0-29), 2 (30-39), 3 (40-59), 4(60+ years).
#smr : a numeric vector with standardized mortality ratios (SMRs)
#expected : a numeric vector with `expected' number of cases obtained from a reference population (Ahlbom, 1993).
#Response is "death", a count
#examine Death by Region (the random factor)
plot(death ~ Region, data=irlsuicide, pch=16,col="lightblue")
#examine the data
hist(death)
#skewed, all positivie, Poisson distribution works
#quick tables of the data
(xtabs(death~ Region, data = irlsuicide))
#How do sex and age affect deaths, have to accout for Region
# random intercept - The intercept for Region can vary. Sex and age are fixed.
m1_res <-glmer(death ~ (1|Region) + sex + age, family= poisson, data = irlsuicide)
summary(m1_res)
#The assumption of Poisson distribution is that the variance is equal to the constant mean.
#So the residuals have the same constant homogeneous assumption. If the data fits to a
#Poisson distribution, then dispersion should be 1. However, dispersion can differ from 1
#when there is a random effect or there are clusters of observations.
plot(m1_res)
#overdispersion is if its much bigger than 1
library(RVAideMemoire)
overdisp.glmer(m1_res)
#The residual plot shows the nonhomogeneous variace property of the Irish Suicide data.
#The overdispersion calculaton also returns greater than 1.
#Therefore, further analyses such as examining over-dispersion, using other distributions,
#centering the variables, or transformation of variables can be considered
#for better model fitting without overdispersion problem.
#Note:
##One ***possible*** approach is to create an observation-level random effect
#Create a new dataset with one level per observation
X=1:104
newirlsuicide=cbind(X,irlsuicide)
#include that new variable as a random effect
m2_res <-glmer(death ~ (1|Region)+(1|X) + sex + age , family= poisson, data = newirlsuicide)
summary(m2_res)
#test overdispersion again
overdisp.glmer(m2_res)
#You'd have to investigate if this were really approapriate for your issue.
#Other approaches are possible.
detach(irlsuicide)