AW: [R] read.table problems

Jens Oehlschlägel-Akiyoshi jens.oehlschlaegel-akiyoshi at mdfactory.de
Thu Nov 11 20:49:32 CET 1999


Below is a fix for read.table() to better read in CSV-files.

> # try this
> cat("1;2,3\n4;5,6\n", file="d:/temp/t.dat")
> system("cat d:/temp/t.dat", show=TRUE)
1;2,3
4;5,6
> read.table("d:/temp/t.dat", sep=";", dec.sep=",")
  V1  V2
1  1 2.3
2  4 5.6

> # or this
> read.table("d:/temp/t.dat", sep=",", dec.sep=";")
   V1 V2
1 1.2  3
2 4.5  6

> # or even this
> cat("1 ; 2,3\n4 ; 5,6\n", file="d:/temp/t.dat")
> system("cat d:/temp/t.dat", show=TRUE)
1 ; 2,3
4 ; 5,6
> read.table("d:/temp/t.dat", sep=";", dec.sep=",")
  V1  V2
1  1 2.3
2  4 5.6

> # but not this
> read.table("d:/temp/t.dat", sep=",", dec.sep=";")
     V1 V2
1 1 . 2  3
2 4 . 5  6


R-developers:
I marked my few changes with comments.
Please feel welcome to include them within the R copyright.

BTW: I think read.table() or its documentation needs another fix,
     because of the following:


> cat("1 2\n3 4\n", file="d:/temp/t.dat")
> system("cat d:/temp/t.dat", show=TRUE)
1 2
3 4
> read.table("d:/temp/t.dat", as.is=TRUE)$V1
[1] "1" "3"


Citing from the help file:

      as.is: the default behavior of `read.table' is to convert
             non-numeric variables to factors.  The variable
             `as.is' controls this conversion.  Its value is
             either a vector of logicals (values are recycled
             if necessary), or a vector of numeric indices
             which specify which columns should be left as
             character strings.


If as.is is supposed to control the treatment of *non-numeric* variables,
I would expect numeric columns to be converted to numeric in any case,
rather than being left as character strings when as.is=TRUE.

Best regards


Jens Oehlschlägel-Akiyoshi







## read.table: read a delimited text file into a data frame, optionally
## using a decimal separator other than '.'.
##
## Arguments:
##   file         file to read; handed to count.fields() and scan().
##   header       if TRUE the first non-skipped line supplies the column
##                names.  When 'header' is not given it is switched on
##                automatically if the first line has exactly one field
##                fewer than the second line.
##   sep          field separator passed to count.fields() and scan().
##   row.names    if missing: the first column is used when the file has a
##                row-label column, otherwise "1", "2", ... are used;
##                NULL: "1", "2", ...;
##                a single string: name of the column holding the row names;
##                a longer character vector: used as the row names;
##                a single number: index of the column holding the row names.
##   col.names    column names used when there is no header line; defaults
##                to "V1", "V2", ...
##   as.is        logical vector (recycled to the number of columns) or
##                numeric column indices; flagged columns are left as read
##                (character) and are not passed through type conversion.
##   na.strings   strings that scan() should read as NA.
##   skip         number of leading lines to skip.
##   dec.sep      decimal separator used in the file (added by JOA).  It is
##                replaced by '.' before type conversion and must differ
##                from 'sep'.
##   strip.white  passed to scan() (default changed to TRUE by JOA; set it
##                to FALSE for backward compatibility).
##
## Value: the list of columns with class "data.frame" and row names set.
##
## Errors: stops if sep equals dec.sep, if the rows do not all have the same
## number of fields, or if 'as.is' / 'row.names' are invalid.
read.table <-
function (file
, header = FALSE
, sep = ""
, row.names
, col.names
, as.is = FALSE
, na.strings = "NA"
, skip = 0
, dec.sep = '.'       ## added by JOA
, strip.white = TRUE  ## default changed by JOA; FALSE gives the old behaviour
)
{
    ### Start JOA changes
    ## could have a warning
    #if (strip.white && sep[1] %in% c("", " ", "\t")) warning("read.table: strip.white AND with white space separator !?")

    ## A field separator identical to the decimal separator cannot be parsed.
    if (dec.sep[1] == sep[1])
        stop("parsing rule violation: sep must not equal dec.sep")

    ## Type conversion only understands '.' as the decimal point, so any
    ## other decimal separator is replaced before converting.
    ## The helper has its own name (rather than shadowing type.convert) so
    ## that it can delegate to the public type.convert().
    ## In case a global option for the decimal separator is introduced,
    ## this fix needs to be adapted.
    convert.column <- function(x, na.strings = "NA", as.is = FALSE,
                               dec = dec.sep) {
        if (dec[1] != ".") {
            ## fixed = TRUE: the separator is a literal string, not a regex
            x <- gsub(dec[1], ".", x, fixed = TRUE)
        }
        type.convert(x, na.strings = na.strings, as.is = as.is)
    }
    ### Stop JOA changes

    ## Arguments are named so that 'skip' cannot be taken for another
    ## positional parameter of count.fields().
    row.lens <- count.fields(file, sep = sep, skip = skip)
    nlines <- length(row.lens)
    ## A first line that is one field shorter than the second means: header
    ## line followed by rows that start with a row label.
    rlabp <- nlines > 1 && (row.lens[2] - row.lens[1]) == 1
    if (rlabp && missing(header))
        header <- TRUE
    if (header) {
        col.names <- scan(file, what = "", sep = sep, nlines = 1,
            quiet = TRUE, skip = skip)
        skip <- skip + 1
        row.lens <- row.lens[-1]
        nlines <- nlines - 1
    }
    else if (missing(col.names))
        col.names <- paste("V", 1:row.lens[1], sep = "")
    cols <- unique(row.lens)
    if (length(cols) != 1) {
        cat("\nrow.lens=\n")
        print(row.lens)
        stop("all rows must have the same length.")
    }
    ## Read every column as character first; conversion happens below.
    what <- rep(list(""), cols)
    if (rlabp)
        col.names <- c("row.names", col.names)
    names(what) <- col.names
    ## strip.white is forwarded to scan() (changed by JOA)
    data <- scan(file = file, what = what, sep = sep, skip = skip,
        na.strings = na.strings, strip.white = strip.white, quiet = TRUE)

    if (cols != length(data)) {
        warning(paste("cols =", cols, " != length(data) =", length(data)))
        cols <- length(data)
    }
    ## Normalise as.is to one logical flag per column.
    if (is.logical(as.is)) {
        as.is <- rep(as.is, length = cols)
    }
    else if (is.numeric(as.is)) {
        if (any(as.is < 1 | as.is > cols))
            stop("invalid numeric as.is expression")
        i <- rep(FALSE, cols)
        i[as.is] <- TRUE
        as.is <- i
    }
    else if (length(as.is) != cols)
        stop(paste("as.is has the wrong length", length(as.is),
            "!= cols =", cols))
    for (i in seq_len(cols)) if (!as.is[i])
        data[[i]] <- convert.column(data[[i]])
    ## Work out the row names and drop the column they came from, if any.
    if (missing(row.names)) {
        if (rlabp) {
            row.names <- data[[1]]
            data <- data[-1]
        }
        else row.names <- as.character(1:nlines)
    }
    else if (is.null(row.names)) {
        row.names <- as.character(1:nlines)
    }
    else if (is.character(row.names)) {
        ## A single string names the column to use; a longer vector is
        ## taken as the row names themselves.
        if (length(row.names) == 1) {
            rowvar <- (1:cols)[match(col.names, row.names, 0) ==
                1]
            row.names <- data[[rowvar]]
            data <- data[-rowvar]
        }
    }
    else if (is.numeric(row.names) && length(row.names) == 1) {
        rlabp <- row.names
        row.names <- data[[rlabp]]
        data <- data[-rlabp]
    }
    else stop("invalid row.names specification")
    class(data) <- "data.frame"
    row.names(data) <- row.names
    data
}



## Example (Windows Paths)
## Demonstrates the dec.sep argument of the read.table() defined above.

demo.file <- "d:/temp/t.dat"   # scratch file rewritten by the examples below

# Case 1: ';' separates the fields, ',' is the decimal separator
cat("1;2,3\n4;5,6\n", file = demo.file)
system(paste("cat", demo.file), show = TRUE)
read.table(demo.file, sep = ";", dec.sep = ",")
# Case 2: same file, roles swapped (',' separates, ';' is the decimal separator)
read.table(demo.file, sep = ",", dec.sep = ";")
# Case 3: blanks around the ';' while it is the field separator
cat("1 ; 2,3\n4 ; 5,6\n", file = demo.file)
system(paste("cat", demo.file), show = TRUE)
read.table(demo.file, sep = ";", dec.sep = ",")
# Case 4: blanks around the ';' while it is the decimal separator --
# the first column is not read as a number here
read.table(demo.file, sep = ",", dec.sep = ";")

# BTW: with as.is=TRUE a purely numeric column comes back as character
cat("1 2\n3 4\n", file = demo.file)
system(paste("cat", demo.file), show = TRUE)
read.table(demo.file, as.is = TRUE)$V1


--
Dr. Jens Oehlschlägel-Akiyoshi
MD FACTORY GmbH
Bayerstrasse 21

80335 München

Tel.: 089 545 28-27
Fax.: 089 545 28-10
http://www.mdfactory.de

-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-help mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !)  To: r-help-request at stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._



More information about the R-help mailing list