I am mostly using this post (http://www.ats.ucla.edu/stat/r/dae/logit.htm) as an example.

#get data from server #data is an example from what factors result in admission (binary) to grad school: GPA, school rank, and GRE score mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv") ## view the first few rows of the data head(mydata) #look at a summary of the data summary(mydata) #I like boxplots along with summaries, they help me boxplot(mydata$rank) #use sapply() to get the standard deviation of teh data #sapply works on a list or vector of data and returns a vector #apply() did not work on this dataset, but lapply() did, but gave a list(?) output i think #lapply(mydata, sd) sapply(mydata, sd) #now they want us to do a two-way contingency table for categorical outcomes #contingency tables are useful, but they can sometimes get cumbersome if there are lots of variables #the help reference manual for xtab() suggests using ftable() if you have lots of variables you want to look at xtabs(~admit + rank, data = mydata) #First they switch rank to be a 'factor' mydata$rank <- factor(mydata$rank) #Then they use a generalized linear model to run the logistic regression mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial") #now you can call mylogit or a summary of mylogit summary(mylogit) ##The output shows that all of the coefficients are significant. The estimate shows the log odds that a single unit change has on the outcome ##log odds are how logistic regressions express the odds of an outcome. ##odds ratios are the odds of success over odds failure (eg 80% success, .8/.2, or 4 to 1) ##log odds allow this to be expressed from a range -inf to +inf #see http://www.ats.ucla.edu/stat/mult_pkg/faq/general/odds_ratio.htm for better explanation #lets look at the confidence interval for the analysis confint.default(mylogit) #if you have a ranked item (like school rank in this example) you can use a Wald test to see if Rank is overall significant wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 4:6) #now we can look at the coefficients as true odds ratios by raising them to exp #for example, One unit increase in GPA is 2.23X more likely to have admission exp(coef(mylogit)) #whatif you want to focus in on the effects of one coefficient? #yiou can hold the other coefficients at their mean and test out the different ranks newdata1 <- with(mydata, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))newdata1$rankP <- predict(mylogit, newdata = newdata1, type = "response") newdata1 #here we see that, GPA and GRE being average, a student from rank 1 university has a 52% chance of getting in, whereas a student rank 4 hasa 18% chance #we can look at other factors as well, like GPA and rank #first create 100 subsets between 200 and 800 GRE score newdata2 <- with(mydata, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100), 4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))#now estimate the predicted probability based on GRE and rank newdata3 <- cbind(newdata2, predict(mylogit, newdata = newdata2, type = "link", se = TRUE)) newdata3 <- within(newdata3, { PredictedProb <- plogis(fit) LL <- plogis(fit - (1.96 * se.fit)) UL <- plogis(fit + (1.96 * se.fit)) })## view first few rows of final dataset head(newdata3) #now plot using ggplot2 library(ggplot2) ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL, ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank), size = 1) #now we can look at how effective our 3 coefficient model was at predicting, and whether it was better than an empty model with(mylogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE)) #with a chi square of 41.46 and a pvalue of 7.6e-8, this model is better than an empty model

Advertisements