AW: [R] read.table problems
Jens Oehlschlägel-Akiyoshi
jens.oehlschlaegel-akiyoshi@mdfactory.de
Mon, 15 Nov 1999 16:43:04 +0100
Two short remarks
1) I still do changes on my version of read.table() as I might need a proper
csv.reader during the next days,
so perhaps you might want to wait
2) Of course an alternative to patching read.table() is renaming my version
to e.g. read.table.csv()
(but it is not a full csv.reader yet)
Here is an update, which by default strips quotation marks enclosing
colnames, character data or numeric data
With this version you can read in csv data such that
"white.begin" , "predy" , "x" , "y"
" asdf" , 0.367985 , "25.34" , 0
gives as resulting col.names instead of
> names(temp)
[1] "\"highq\"" "\"lowq\"" "\"predy\"" "\"x\"" "\"y\""
> names(temp)
[1] "highq" "lowq" "predy" "x" "y"
and as resulting data
" asdf", 0.367985, 25.34, 0
however it will fail on character data containing the separator or quotation
marks
Best regards
Jens Oehlschlägel-Akiyoshi
read.table <-
function (file
, header = FALSE
, sep = ""
, row.names
, col.names
, as.is = FALSE
, na.strings = "NA"
, skip = 0
, dec.sep = '.' ## added by JOA
, strip.white = TRUE ## added by JOA, set to FALSE for backward
compatibility
, quotation.mark = '"' ## added by JOA
, strip.quotation.marks = TRUE ## added by JOA, set to FALSE for backward
compatibility
)
{
##### remark: this version of read.table seems to works fine for
numerical csv data
##### but will fail on character data containing the separator or
quotation marks
### Start JOA changes
## could have a warning
#if (strip.white && sep[1] %in% c("", " ", "\t"))warning("read.table:
strip.white AND with white space seperator !?")
## .Internal(type.convert) calls do_typecvt() which calls C-library
function strtod() which uses '.' as dec.sep
## thus let's replace other dec.sep here
if (dec.sep[1]==sep[1])stop("parsing rule violation: sep must not equal
dec.sep")
type.convert <- function(x, na.strings = "NA", as.is = FALSE,
dec=dec.sep){
if (dec[1]!="."){
# R-developers may know a more efficient internal function to replace
characters
# I also don't know whether it is efficient to change parameters
before calling .Internal
# however it seems to work
# In case a global option$dec.sep is introduced, this fix needs to be
adapted
x <- gsub(dec[1], '.', x, ignore.case=FALSE, extended=FALSE)
}
.Internal(type.convert(x, na.strings, as.is))
}
quoted.string.matcher <- paste('\(^', quotation.mark, '\)\(.*\)\(',
quotation.mark, '$\)', sep='')
unquote <- function(text, quoted.string)sub(quoted.string.matcher,
'\\2', text, extended=T)
### Stop JOA changes
row.lens <- count.fields(file, sep, skip)
nlines <- length(row.lens)
rlabp <- nlines > 1 && (row.lens[2] - row.lens[1]) == 1
if (rlabp && missing(header))
header <- TRUE
if (header) {
col.names <- scan(file, what = "", sep = sep, nlines = 1,
strip.white = strip.white, # added by JOA
quiet = TRUE, skip = skip)
skip <- skip + 1
row.lens <- row.lens[-1]
nlines <- nlines - 1
}
if (strip.quotation.marks) col.names <- unquote(col.names) ## added by
JOA
else if (missing(col.names))
col.names <- paste("V", 1:row.lens[1], sep = "")
cols <- unique(row.lens)
if (length(cols) != 1) {
cat("\nrow.lens=\n")
print(row.lens)
stop("all rows must have the same length.")
}
what <- rep(list(""), cols)
if (rlabp)
col.names <- c("row.names", col.names)
names(what) <- col.names
data <- scan(file = file, what = what, sep = sep, skip = skip,
na.strings = na.strings,
strip.white = strip.white, ## added by JOA
quiet = TRUE)
if (cols != length(data)) {
warning(paste("cols =", cols, " != length(data) =", length(data)))
cols <- length(data)
}
if (is.logical(as.is)) {
as.is <- rep(as.is, length = cols)
}
else if (is.numeric(as.is)) {
if (any(as.is < 1 | as.is > cols))
stop("invalid numeric as.is expression")
i <- rep(FALSE, cols)
i[as.is] <- TRUE
as.is <- i
}
else if (length(as.is) != cols)
stop(paste("as.is has the wrong length", length(as.is),
"!= cols =", cols))
## Start changed by JOA
if (strip.quotation.marks) {
for (i in 1:cols)
if (as.is[i])
data[[i]] <- unquote(data[[i]])
else
data[[i]] <- type.convert( unquote(data[[i]]) )
} else {
for (i in 1:cols) if (!as.is[i])
data[[i]] <- type.convert(data[[i]])
}
## Stop changed by JOA
if (missing(row.names)) {
if (rlabp) {
row.names <- data[[1]]
data <- data[-1]
}
else row.names <- as.character(1:nlines)
}
else if (is.null(row.names)) {
row.names <- as.character(1:nlines)
}
else if (is.character(row.names)) {
if (length(row.names) == 1) {
rowvar <- (1:cols)[match(col.names, row.names, 0) ==
1]
row.names <- data[[rowvar]]
data <- data[-rowvar]
}
}
else if (is.numeric(row.names) && length(row.names) == 1) {
rlabp <- row.names
row.names <- data[[rlabp]]
data <- data[-rlabp]
}
else stop("invalid row.names specification")
class(data) <- "data.frame"
row.names(data) <- row.names
data
}
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-devel mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !) To: r-devel-request@stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._