[R] Strange behavior when trying to piggyback off of "fitdistr"

Tue Jan 12 23:01:06 CET 2010

Hello.

I am not certain even how to search the archives for this particular question, so if there is an obvious answer, please smack me with a large halibut and send me to the URLs.

I have been experimenting with fitting curves by using both maximum likelihood and maximum spacing estimation techniques. Originally, I have been writing distribution-specific functions in 'R' which work rather well. As the procedure is identical for all distributions, other than the actual distribution function itself, I thought I would try to build a single function that accepted the distribution as an input and returned the results. Never having played with calls, formals, and arguments before, I figured I would rip off, cough, cough, be inspired by the venerable "fitdistr" in MASS, which accepts distribution inputs.

After a few hours, I actually got it working decently (although unfinished). However, I am finding something very weird. At its core, the technique requires the difference between the value of the cumulative distribution function at neighboring evaluations. I implement this by running p($DIST) on the vector of sorted losses (call it SP), creating two new vectors, one c(0, SP) and one c(SP, 1), and then taking the latter minus the former. If there happen to be two instances of the same value, unless it is known rounding error, one substitutes the density at that point for the difference in the cumulative distribution (which would be 0, as the CDF of two identical values is the same). So, I run d($DIST), add a 0 in front to make it the same length, and return a new vector equal to pmax.int(DIFFERECEN, DENSITY), with the idea that the density is always >0 and always less than the difference in cumulative distributions, so it will only be max in the case of DIFFERENCE being truly 0. I then take negative the sum of the log of the differences and that is the function passed to optim.

What is weird is when I leave out the density correction (which is safe 99.9999% of the time as the chances of two identical losses is almost 0 (assuming no clustering/capping) ), I get a very similar result to my distribution-customized function which calls the proper plnorm or pgenpareto directly. When I add in the correction, the value is orders of magnitude higher, which not only affects the fit (slightly) but also affects the goodness of fit statistics. I have no idea why this happens, although in theory, if the function is pulling too many density values, it would return a higher value as the densities are much closer to 0 so the neg-log is a larger number.

In the code pasted below, if "spacing" returns -sum(log(SP2), it works fine. If it returns -(sum(log(SP3)), it gives strange results.

I do not have the S programming language book (perhaps I should invest in it) and the online help wasn't that helpful to me, so I would very much appreciate any responses y'all may have.

Thank you very much,

--Avi

#####################################################
Code (Unfinished)

MSEFit <- function (x, distfun, start, ...)
{
    require (MASS); require (actuar);
    Call <- match.call(expand.dots = TRUE)
    if (missing(start))
        start <- NULL
    dots <- names(list(...))
    if (missing(x) || length(x) == 0L || mode(x) != "numeric") 
        stop("'x' must be a non-empty numeric vector")
    if (any(!is.finite(x))) 
        stop("'x' contains missing or infinite values")
    if (missing(distfun) || !(is.function(distfun) || is.character(distfun))) 
        stop("'density' must be supplied as a function or name")
    n <- length(x)
    if (is.character(distfun)) {
        distname <- tolower(distfun)
        densfun <- switch(distname, exp = dexp, exponential = dexp, gamma = dgamma,
            `log-normal` = dlnorm, lnorm = dlnorm, lognormal = dlnorm, weibull = dweibull,
            pareto = dpareto, loglogistic = dllogis, transbeta = dtrbeta,
            `transformed beta` = dtrbera, burr = dburr, paralogistic = dparalogis,
            genpareto = dgenpareto, generalizedpareto = dgenpareto,
            `generalized pareto` = dgenpareto, invburr = dinvburr, 
            `inverse burr` = dinvburr, invpareto = dinvpareto, `inverse pareto` = dinvpareto,
            invparalogistic = dinvparalogis, `inverse paralogistic` = dinvparalogis,
            transgamma = dtrgamma, `transformed gamma` = dtrgamma, invexp = dinvexp,
            `inverse exponential` = dinvexp, invtransgamma = dinvtrgamma,
            `inverse transformed gamma` = dinvtrgamma, invgamma = dinvgamma,
            `inverse gamma` = dinvgamma, invweibull = dinvweibull, `inverse weibull` = dinvweibull,
            loggamma = dlgamma, genbeta = dgenbeta, `generalized beta` = dgenbeta, NULL)
        if (is.null(densfun)) 
            stop("unsupported distribution")            
        distfun <- switch(distname, exp = pexp, exponential = pexp, gamma = pgamma,
            `log-normal` = plnorm, lnorm = plnorm, lognormal = plnorm, weibull = pweibull,
            pareto = ppareto, loglogistic = pllogis, transbeta = ptrbeta,
            `transformed beta` = ptrbera, burr = pburr, paralogistic = pparalogis,
            genpareto = pgenpareto, generalizedpareto = pgenpareto,
            `generalized pareto` = pgenpareto, invburr = pinvburr, 
            `inverse burr` = pinvburr, invpareto = pinvpareto, `inverse pareto` = pinvpareto,
            invparalogistic = pinvparalogis, `inverse paralogistic` = pinvparalogis,
            transgamma = ptrgamma, `transformed gamma` = ptrgamma, invexp = pinvexp,
            `inverse exponential` = pinvexp, invtransgamma = pinvtrgamma,
            `inverse transformed gamma` = pinvtrgamma, invgamma = pinvgamma,
            `inverse gamma` = pinvgamma, invweibull = pinvweibull, `inverse weibull` = pinvweibull,
            loggamma = plgamma, genbeta = pgenbeta, `generalized beta` = pgenbeta, NULL)
        if (distname %in% c("lognormal", "log-normal", "lnorm")) {
            if (!is.null(start)) 
                stop("supplying pars for the log-Normal is not supported")
            if (any(x <= 0)) 
                stop("need positive values to fit a log-Normal")
            mx <- mean(log(x))
            sx <- sd(log(x))
            start <- list(meanlog = mx, sdlog = sx)
            start <- start[!is.element(names(start), dots)]
        }
        if (distname == "exp" && is.null(start)) {
            mu <- mean(x)
            start <- list(rate = 1/mu)
            start <- start[!is.element(names(start), dots)]
        }    
        if (distname == "gamma" && is.null(start)) {
            m <- mean(x)
            t <- sum(x^2)/n
            shape <- m^2/(t-m^2)
            scale <- (t-m^2)/m
            start <- list(shape = shape, scale = scale)
            start <- start[!is.element(names(start), dots)]
        }    
        if (distname == "weibull" && is.null(start)) {
            p <- quantile (x,.25)
            q <- quantile(x, .75)
            g <- log(log(4))/log(log(4/3))
            theta <- (g*log(p) - log(q))/(g-1)
            tau <- log(log(4))/(log(q) - log(theta))
            start <- list(shape = tau, scale = theta)
            start <- start[!is.element(names(start), dots)]
        }    
        if (distname %in% c("invweibull", "inverse weibull") && is.null(start)) {
            p <- quantile (x,.25)
            q <- quantile(x, .75)
            g <- log(log(4))/log(log(4/3))
            theta <- (g*log(q) - log(p))/(g-1)
            tau <- log(log(4))/(log(theta) - log(p))
            start <- list(shape = tau, scale = theta)
            start <- start[!is.element(names(start), dots)]
        }  
        if (distname %in% c("pareto", "paralogistic") && is.null(start)) {
            m <- mean(x)
            t <- sum(x^2)/n
            shape <- 2*(t-m^2)/(t-2*m^2)
            scale <- (m*t)/(t-2*m^2)
            start <- list(shape = shape, scale = scale)
            start <- start[!is.element(names(start), dots)]
        }
        if (distname %in% c("invpareto", "inverse pareto") && is.null(start)) {
            m <- mean(x)
            t <- sum(x^2)/n
            alpha <- 2*(t-m^2)/(t-2*m^2)
            scale <- (m*t)/(t-2*m^2)
            start <- list(shape = 1/alpha, scale = scale)
            start <- start[!is.element(names(start), dots)]
        } 
        if (distname == "loglogistic" && is.null(start)) {
            p <- quantile (x,.25)
            q <- quantile(x, .75)
            shape <- 2*log(3)/(log(q)-log(p))
            scale <- exp((log(q)+log(p))/2)
            start <- list(shape = shape, scale = scale)
            start <- start[!is.element(names(start), dots)]
        } 
        if (distname %in% c("transbeta", "transformed beta") && is.null(start)) {
            m <- mean(x)
            v <- var (x)
            tau <- 1
            beta <- 2
            alpha <- max(((v+m^2)*beta)/(v*beta-m^2),2.1)
            theta <- (alpha-1)*m/beta
            start <- list(shape1 = alpha, shape2 = beta, shape3 = tau, scale = theta)
            start <- start[!is.element(names(start), dots)]
        } 
        if (distname == "burr" && is.null(start)) {
            m <- mean(x)
            start <- list(shape1 = 3, shape2= .5, scale = m)
            start <- start[!is.element(names(start), dots)]
        }
         if (distname %in% c("invburr", "inverse burr") && is.null(start)) {
            m <- mean(x)
            start <- list(shape1 = 1, shape2= 3, scale = m)
            start <- start[!is.element(names(start), dots)]
        } 
        if (distname %in% c("genpareto", "generalized pareto", "generalizedpareto") && is.null(start)) {
            m <- mean(x)
            v <- var (x)
            tau <- 2
            alpha <- max(((v+m^2)*tau)/(v*tau-m^2),2.1)
            theta <- (alpha-1)*m/tau
            start <- list(shape1 = alpha, shape2= tau, scale = theta)
        }
    }
######################################## (FINISH STARTING VALUES LATER) ############   
    if (is.null(start) || !is.list(start))
        stop("'start' must be a named list")
    nm <- names(start)
    f <- formals(distfun)
    args <- names(f)
    m <- match(nm, args)
    if (any(is.na(m))) 
        stop("'start' specifies names which are not arguments to 'distfun'")
    formals(densfun) <- c(f[c(1, m)], f[-c(1, m)])   
    formals(distfun) <- c(f[c(1, m)], f[-c(1, m)])
    denss <- function (parm, x, ...) densfun (x, parm, ...)
    if ((l <- length(nm)) > 1L) 
        body(denss) <- parse(text = paste("densfun(x,", paste("parm[",1L:l,"]", collapse = ", "), ", ...)"))
    penss <- function (parm, x, ...) distfun (x, parm, ...)
    if ((l <- length(nm)) > 1L) 
        body(penss) <- parse(text = paste("distfun(x,", paste("parm[",1L:l,"]", collapse = ", "), ", ...)"))
    spacing <- function(parm, ...) {
        SP<-penss(parm, ...)
        SQ<-denss(parm, ...)
        L1<-c(0,SP)
        L2<-c(SP,1)
        SP2<-L2-L1
        SQ2 <- c(0, SQ)
        SP3 <- pmax.int(SP2, SQ2)
    return(-sum(log(SP2)))
    }
    Call[[1L]] <- as.name("optim")
    Call$densfun <- Call$distfun <- Call$start <- NULL
    Call$x <- x
    Call$par <- start
    Call$fn <- spacing
    Call$method <- "Nelder-Mead"
    Call$hessian <- FALSE
    Call$control$reltol = .Machine$double.eps
    Call$control$maxit = 30000
    res <- eval.parent(Call)
    k <- length(start)
    EMC<-0.5772156649
    m<-n+1
    gm<-m*(log(m)+EMC)-.5-(1/(12*m))
    sm2<-m*(pi^2/6-1)-.5-1/(6*m)
    sm<-sqrt(sm2)
    C1<-gm-(sqrt(.5*n)*sm)
    C2<-(2*n)^(-.5)*sm
    TT<-(res$value+(k/2)-C1)/C2
    pwr=pchisq(TT,df=n)
    structure(list(estimate = res$par, Value = res$value, TestStat = TT, TestPower = 1-pwr, n = n, message=res$message,
        conv = res$convergence, counts = res$counts))
}