[R] why change days of the week from a factor to an ordered factor?
Duncan Mackay
dulcalma at bigpond.com
Tue Dec 3 03:29:32 CET 2013
Hi Bill
eg
> colours = 1:8
> coloursf = factor(1:8)
> colourso = ordered(1:8)
> str(coloursf)
Factor w/ 8 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8
> str(colourso)
Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 1 2 3 4 5 6 7 8
coloursf2 <- factor(1:8, levels = 8:1)
str(coloursf2)
Duncan
Duncan
Duncan Mackay
Department of Agronomy and Soil Science
University of New England
Armidale NSW 2351
Email: home: mackay at northnet.com.au
ordered used in
used in MASS::polr and GEE for polytomous logistic regression
-----Original Message-----
From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On
Behalf Of Bill
Sent: Monday, 2 December 2013 21:24
To: r-help at r-project.org
Subject: [R] why change days of the week from a factor to an ordered factor?
I am reading the code below. It acts on a csv file called dodgers.csv with
the following variables.
> print(str(dodgers)) # check the structure of the data frame
'data.frame': 81 obs. of 12 variables:
$ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1
1 ...
$ day : int 10 11 12 13 14 15 23 24 25 27 ...
$ attend : int 56000 29729 28328 31601 46549 38359 26376 44014 26345
44807 ...
$ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7
1 ...
$ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11
3 3 3 10 ...
$ temp : int 67 58 57 54 57 65 60 63 64 66 ...
$ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1
...
$ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ...
$ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
$ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
$ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ...
$ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
NULL
>
I don't understand why the author of the code decided to make the factor
days_of_week into an ordered factor. Anyone know why this should be done?
Thank you.
Here is the code:
# Predictive Model for Los Angeles Dodgers Promotion and Attendance
library(car) # special functions for linear regression
library(lattice) # graphics package
# read in data and create a data frame called dodgers dodgers <-
read.csv("dodgers.csv")
print(str(dodgers)) # check the structure of the data frame
# define an ordered day-of-week variable # for plots and data summaries
dodgers$ordered_day_of_week <- with(data=dodgers,
ifelse ((day_of_week == "Monday"),1,
ifelse ((day_of_week == "Tuesday"),2,
ifelse ((day_of_week == "Wednesday"),3,
ifelse ((day_of_week == "Thursday"),4,
ifelse ((day_of_week == "Friday"),5,
ifelse ((day_of_week == "Saturday"),6,7))))))) dodgers$ordered_day_of_week
<- factor(dodgers$ordered_day_of_week,
levels=1:7,
labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"))
# exploratory data analysis with standard graphics: attendance by day of
week with(data=dodgers,plot(ordered_day_of_week, attend/1000, xlab = "Day of
Week", ylab = "Attendance (thousands)", col = "violet", las = 1))
# when do the Dodgers use bobblehead promotions with(dodgers,
table(bobblehead,ordered_day_of_week)) # bobbleheads on Tuesday
# define an ordered month variable
# for plots and data summaries
dodgers$ordered_month <- with(data=dodgers,
ifelse ((month == "APR"),4,
ifelse ((month == "MAY"),5,
ifelse ((month == "JUN"),6,
ifelse ((month == "JUL"),7,
ifelse ((month == "AUG"),8,
ifelse ((month == "SEP"),9,10)))))))
dodgers$ordered_month <- factor(dodgers$ordered_month, levels=4:10, labels =
c("April", "May", "June", "July", "Aug", "Sept", "Oct"))
# exploratory data analysis with standard R graphics: attendance by month
with(data=dodgers,plot(ordered_month,attend/1000, xlab = "Month", ylab =
"Attendance (thousands)", col = "light blue", las = 1))
# exploratory data analysis displaying many variables # looking at
attendance and conditioning on day/night # the skies and whether or not
fireworks are displayed
library(lattice) # used for plotting
# let us prepare a graphical summary of the dodgers data group.labels <-
c("No Fireworks","Fireworks") group.symbols <- c(21,24) group.colors <-
c("black","black") group.fill <- c("black","red")
xyplot(attend/1000 ~ temp | skies + day_night,
data = dodgers, groups = fireworks, pch = group.symbols,
aspect = 1, cex = 1.5, col = group.colors, fill = group.fill,
layout = c(2, 2), type = c("p","g"),
strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1),
xlab = "Temperature (Degrees Fahrenheit)",
ylab = "Attendance (thousands)",
key = list(space = "top",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols), col = rev(group.colors),
fill = rev(group.fill))))
# attendance by opponent and day/night game group.labels <- c("Day","Night")
group.symbols <- c(1,20) group.symbols.size <- c(2,2.75) bwplot(opponent ~
attend/1000, data = dodgers, groups = day_night,
xlab = "Attendance (thousands)",
panel = function(x, y, groups, subscripts, ...)
{panel.grid(h = (length(levels(dodgers$opponent)) - 1), v = -1)
panel.stripplot(x, y, groups = groups, subscripts = subscripts,
cex = group.symbols.size, pch = group.symbols, col = "darkblue")
},
key = list(space = "top",
text = list(group.labels,col = "black"),
points = list(pch = group.symbols, cex = group.symbols.size,
col = "darkblue")))
# specify a simple model with bobblehead entered last my.model <- {attend ~
ordered_month + ordered_day_of_week + bobblehead}
# employ a training-and-test regimen
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(dodgers))),
rep(2,length=(nrow(dodgers) - trunc((2/3)*nrow(dodgers)))))
dodgers$training_test <- sample(training_test) # random permutation
dodgers$training_test <- factor(dodgers$training_test,
levels=c(1,2), labels=c("TRAIN","TEST")) dodgers.train <- subset(dodgers,
training_test == "TRAIN")
print(str(dodgers.train)) # check training data frame dodgers.test <-
subset(dodgers, training_test == "TEST")
print(str(dodgers.test)) # check test data frame
# fit the model to the training set
train.model.fit <- lm(my.model, data = dodgers.train) # obtain predictions
from the training set dodgers.train$predict_attend <-
predict(train.model.fit)
# evaluate the fitted model on the test set dodgers.test$predict_attend <-
predict(train.model.fit,
newdata = dodgers.test)
# compute the proportion of response variance # accounted for when
predicting out-of-sample cat("\n","Proportion of Test Set Variance Accounted
for: ", round((with(dodgers.test,cor(attend,predict_attend)^2)),
digits=3),"\n",sep="")
# merge the training and test sets for plotting dodgers.plotting.frame <-
rbind(dodgers.train,dodgers.test)
# generate predictive modeling visual for management group.labels <- c("No
Bobbleheads","Bobbleheads") group.symbols <- c(21,24) group.colors <-
c("black","black") group.fill <- c("black","red")
xyplot(predict_attend/1000 ~ attend/1000 | training_test,
data = dodgers.plotting.frame, groups = bobblehead, cex = 2,
pch = group.symbols, col = group.colors, fill = group.fill,
layout = c(2, 1), xlim = c(20,65), ylim = c(20,65),
aspect=1, type = c("p","g"),
panel=function(x,y, ...)
{panel.xyplot(x,y,...)
panel.segments(25,25,60,60,col="black",cex=2)
},
strip=function(...) strip.default(..., style=1),
xlab = "Actual Attendance (thousands)",
ylab = "Predicted Attendance (thousands)",
key = list(space = "top",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols),
col = rev(group.colors),
fill = rev(group.fill))))
# use the full data set to obtain an estimate of the increase in #
attendance due to bobbleheads, controlling for other factors my.model.fit <-
lm(my.model, data = dodgers) # use all available data
print(summary(my.model.fit))
# tests statistical significance of the bobblehead promotion # type I anova
computes sums of squares for sequential tests
print(anova(my.model.fit))
cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ",
round(my.model.fit$coefficients[length(my.model.fit$coefficients)],
digits = 0),"\n",sep="")
# standard graphics provide diagnostic plots
plot(my.model.fit)
# additional model diagnostics drawn from the car package
library(car)
residualPlots(my.model.fit)
marginalModelPlots(my.model.fit)
print(outlierTest(my.model.fit))
[[alternative HTML version deleted]]
______________________________________________
R-help at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
More information about the R-help
mailing list