[R] Writing a function to fit ALSOS models. problem with normalization?
David Hugh-Jones
davidhughjones at gmail.com
Thu Mar 23 19:14:14 CET 2006
Dear all,
Below is my attempt at a function to fit Alternate Least Squares
Optimal Scaling models, as described in Young (1981) "Quantitative
Analysis of Qualitative Data" and Jacoby (1999) "Levels of Measurement
and Political Research: An Optimistic View".
I would welcome any comments on coding style, tips & tricks etc.
I also have a specific problem: the output tends to give very small
coefficients, and very large fitted values for specific factor levels.
Am I doing the normalization right?
Cheers
David
library(car) # for "recode"
alsos.fit = function (y, x, tol = 1e-7) {
# y is a vector or factor or ordered factor
# x is a data frame of vectors/factors/ordereds
# we treat the y's as the first column of the matrix
x = cbind(y, x)
x = x[complete.cases(x),]
ox = x
x.facs = sapply(x, is.factor)
x.ords = sapply(x, is.ordered)
# start with our numeric values whatever they are
x = sapply(x, as.numeric)
old.SSE = Inf
while(T) {
# least squares regression with an intercept
ols = lm.fit(cbind(rep(1,nrow(x)), x[,-1]) , x[,1])
b = ols$coefficients
SSE = drop(ols$residuals %*% ols$residuals)
if (old.SSE-SSE<tol) {
factor.scores=list()
for (i in (1:ncol(x))[x.facs]) {
nm = colnames(x)[i]
factor.scores[[nm]] = tapply(x[,i], ox[,i],
function (foo) {return(foo[1])})
names(factor.scores[[nm]]) = levels(ox[,i])
}
return(list(
factor.scores=factor.scores,
scaled.y=x[,1], scaled.x=x[,-1],
coefficients=b, SSE=SSE,
))
}
old.SSE=SSE
mx = nx = x
mx[] = 0
nx[] = 0
for (i in (1:ncol(x))[x.facs]) {
# optimal scaling
if (i==1) nx[,i] = ols$fitted.values
else nx[,i] = (x[,1] - cbind(rep(1,nrow(x)), x[,c(-1,-i)]) %*% b[-i])/b[i]
# create within-category means
tmpfac = factor(ox[,i], labels=1:nlevels(ox[,i]))
catmeans = tapply(nx[,i], tmpfac, mean)
# ensure ordinal values are correctly ordered
if (x.ords[i]) {
tmp = kruskal.ordering(nx[,i], tmpfac)
tmpfac = tmp$tmpfac
catmeans = tmp$catmeans
}
# set values to within-category means
mx[,i] = catmeans[tmpfac]
# normalize according to Young (1981)
mx[,i] = mx[,i] * (nx[,i] %*% nx[,i]) / (mx[,i] %*% mx[,i])
}
x[,x.facs] = mx[,x.facs]
}
}
# as described in Kruskal (1964)
kruskal.ordering = function(numeric.data, tmpfac) {
j = 1
upact = T
while(T) {
catmeans = tapply(numeric.data, tmpfac, mean) # vector w as many
items as tmpfac cats
# have we finished?
if (j>nlevels(tmpfac)) return (list(catmeans=catmeans,tmpfac=tmpfac))
if ((j==nlevels(tmpfac) || catmeans[j] <= catmeans[j+1]) &&
(j==1 || catmeans[j] >= catmeans[j-1])) {
j=j+1
upact=T
next
}
if (upact) {
if (j < nlevels(tmpfac) && catmeans[j] > catmeans[j+1]) {
tmpfac = recode(tmpfac, paste(j, ":", j+1,"='", j+1, "'", sep=""))
levels(tmpfac) = 1:nlevels(tmpfac)
}
upact=F
}
else {
if (j > 1 && catmeans[j] < catmeans[j-1]) {
tmpfac = recode(tmpfac, paste(j-1, ":", j, "='", j, "'", sep=""))
levels(tmpfac) = 1:nlevels(tmpfac)
j=j-1
}
upact=T
}
}
}
More information about the R-help
mailing list