[R] Writing a function to fit ALSOS models. problem with normalization?

Thu Mar 23 19:14:14 CET 2006

Dear all,

Below is my attempt at a function to fit Alternate Least Squares
Optimal Scaling models, as described in Young (1981) "Quantitative
Analysis of Qualitative Data" and Jacoby (1999) "Levels of Measurement
and Political Research: An Optimistic View".

I would welcome any comments on coding style, tips & tricks etc.

I also have a specific problem: the output tends to give very small
coefficients, and very large fitted values for specific factor levels.
Am I doing the normalization right?

Cheers
David

library(car) # for "recode"

alsos.fit = function (y, x, tol = 1e-7) {
	# y is a vector or factor or ordered factor
	# x is a data frame of vectors/factors/ordereds

	# we treat the y's as the first column of the matrix
	x = cbind(y, x)
	x = x[complete.cases(x),]
	ox = x
	x.facs = sapply(x, is.factor)
	x.ords = sapply(x, is.ordered)

	# start with our numeric values whatever they are
	x = sapply(x, as.numeric)

	old.SSE = Inf
	while(T) {
		# least squares regression with an intercept
		ols = lm.fit(cbind(rep(1,nrow(x)), x[,-1]) , x[,1])
		b = ols$coefficients
		SSE = drop(ols$residuals %*% ols$residuals)
		if (old.SSE-SSE<tol) {
			factor.scores=list()
			for (i in (1:ncol(x))[x.facs]) {
				nm = colnames(x)[i]
				factor.scores[[nm]] = tapply(x[,i], ox[,i],
						function (foo) {return(foo[1])})
				names(factor.scores[[nm]]) = levels(ox[,i])
			}

			return(list(
				factor.scores=factor.scores,
				scaled.y=x[,1], scaled.x=x[,-1],
				coefficients=b, SSE=SSE,
					))
		}
		old.SSE=SSE

		mx = nx = x
		mx[] = 0
		nx[] = 0
		for (i in (1:ncol(x))[x.facs]) {

			# optimal scaling
			if (i==1) nx[,i] = ols$fitted.values
			else nx[,i] = (x[,1] - cbind(rep(1,nrow(x)), x[,c(-1,-i)]) %*% b[-i])/b[i]

			# create within-category means
			tmpfac = factor(ox[,i], labels=1:nlevels(ox[,i]))
			catmeans = tapply(nx[,i], tmpfac, mean)

			# ensure ordinal values are correctly ordered
			if (x.ords[i]) {
				tmp = kruskal.ordering(nx[,i], tmpfac)
				tmpfac = tmp$tmpfac
				catmeans = tmp$catmeans
			}

			# set values to within-category means
			mx[,i] = catmeans[tmpfac]

			# normalize according to Young (1981)
			mx[,i] = mx[,i] * (nx[,i] %*% nx[,i]) / (mx[,i] %*% mx[,i])

		}
		x[,x.facs] = mx[,x.facs]
	}
}

# as described in Kruskal (1964)
kruskal.ordering = function(numeric.data, tmpfac) {
	j = 1
	upact = T
	while(T) {
		catmeans = tapply(numeric.data, tmpfac, mean) # vector w as many
items as tmpfac cats
		# have we finished?
		if (j>nlevels(tmpfac)) return (list(catmeans=catmeans,tmpfac=tmpfac))
		if ((j==nlevels(tmpfac) || catmeans[j] <= catmeans[j+1]) &&
				(j==1 || catmeans[j] >= catmeans[j-1])) {
			j=j+1
			upact=T
			next
		}
		if (upact) {
			if (j < nlevels(tmpfac) && catmeans[j] > catmeans[j+1]) {
				tmpfac = recode(tmpfac, paste(j, ":", j+1,"='", j+1, "'", sep=""))
				levels(tmpfac) = 1:nlevels(tmpfac)
			}
			upact=F
		}
		else {
			if (j > 1 && catmeans[j] < catmeans[j-1]) {
				tmpfac = recode(tmpfac, paste(j-1, ":", j, "='", j, "'", sep=""))
				levels(tmpfac) = 1:nlevels(tmpfac)
				j=j-1
			}
			upact=T
		}
	}
}