[R] Reading CSV file with unequal record length
Peter Dalgaard
p.dalgaard at biostat.ku.dk
Wed Jul 2 23:20:09 CEST 2008
Viswanathan Shankar wrote:
> Hello ,
> I am having some difficulty reading a CSV file with unequal record
> lengths in R. The data has 26 columns, has no header, and was
> generated with the following R call:
> write.table(schat,"schat.csv", sep=",", col.names=FALSE, append = TRUE)
>
> 1.0,1.0,0.0,0.1,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.2,1.5,1.9,2.7,,,,
>
> 1.0,2.0,0.0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.5,0.6,0.7,0.8,0.9,1.1,1.2,1.4,1.6,1.9,2.2,2.7,,,
>
> 1.0,3.0,0.0,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,1.0,1.2,1.4,1.7,2.1,3.1,5.0,,,,,
>
> 1.0,4.0,0.0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.6,0.7,0.7,0.9,1.0,1.2,1.4,1.7,2.2,3.0,,,,,
>
> 1.0,5.0,0.0,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.2,1.4,1.6,1.9,2.4,3.3,,,,
>
> 1.0,6.0,0.0,0.1,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.1,1.3,1.7,2.1,3.4,,,,,
>
> 1.0,7.0,0.0,0.1,0.1,0.2,0.3,0.3,0.4,0.5,0.5,0.6,0.7,0.8,0.9,1.1,1.2,1.4,1.7,2.0,2.5,3.3,5.5,,,
>
> 1.0,8.0,0.0,0.1,0.1,0.2,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.6,0.7,0.8,0.9,1.0,1.2,1.3,1.5,1.7,2.0,2.3,2.8,4.2
>
> 1.0,9.0,0.0,0.1,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.2,1.4,1.6,1.9,2.2,2.9,4.2,,
>
> 1.0,10.0,0.0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.6,0.8,1.0,1.3,1.6,2.4,3.6,6.0,,,,,,,
>
>
> When I use the following call to read the data written above:
>
> schat_n<-data.frame(read.table("schat.csv", sep=",", header = FALSE,
> fill=TRUE))
>
> the data is fine up to record 7, but records 8 and 9 get wrapped: the
> number of columns is limited to 23 and the remaining values are placed in
> a separate record, as shown below, giving 12 records instead of 10:
>
> 1.0,1.0,1.0,0.0,0.1,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.2,1.5,1.9,2.7,NA
>
> 2.0,1.0,2.0,0.0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.5,0.6,0.7,0.8,0.9,1.1,1.2,1.4,1.6,1.9,2.2,2.7
>
> 3.0,1.0,3.0,0.0,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,1.0,1.2,1.4,1.7,2.1,3.1,5.0,NA,NA
>
> 4.0,1.0,4.0,0.0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.6,0.7,0.7,0.9,1.0,1.2,1.4,1.7,2.2,3.0,NA,NA
>
> 5.0,1.0,5.0,0.0,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.2,1.4,1.6,1.9,2.4,3.3,NA
>
> 6.0,1.0,6.0,0.0,0.1,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.1,1.3,1.7,2.1,3.4,NA,NA
>
> 7.0,1.0,7.0,0.0,0.1,0.1,0.2,0.3,0.3,0.4,0.5,0.5,0.6,0.7,0.8,0.9,1.1,1.2,1.4,1.7,2.0,2.5,3.3,5.5
>
> 8.0,1.0,8.0,0.0,0.1,0.1,0.2,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.6,0.7,0.8,0.9,1.0,1.2,1.3,1.5,1.7,2.0
>
> 9.0,2.3,2.8,4.2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
>
> 10.0,1.0,9.0,0.0,0.1,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.2,1.4,1.6,1.9,2.2,2.9
>
> 11.0,4.2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
>
> 12.0,1.0,10.0,0.0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.6,0.8,1.0,1.3,1.6,2.4,3.6,6.0,NA,NA,NA,NA
>
>
> I would like the dataset to be read as is, with 10 records and 26
> columns; any input on how to fix this would be greatly appreciated.
>
Hmmm, I can't reproduce this (old version of R?). Copying from your mail
gives

> write.table(read.table("clipboard",sep=",",fill=TRUE),sep=",",col.names=F)
"1",1,1,0,0.1,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2,1.5,1.9,2.7,NA,NA,NA,NA
"2",1,2,0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.5,0.6,0.7,0.8,0.9,1.1,1.2,1.4,1.6,1.9,2.2,2.7,NA,NA,NA
"3",1,3,0,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,1,1.2,1.4,1.7,2.1,3.1,5,NA,NA,NA,NA,NA
"4",1,4,0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.6,0.7,0.7,0.9,1,1.2,1.4,1.7,2.2,3,NA,NA,NA,NA,NA
"5",1,5,0,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2,1.4,1.6,1.9,2.4,3.3,NA,NA,NA,NA
"6",1,6,0,0.1,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1.1,1.3,1.7,2.1,3.4,NA,NA,NA,NA,NA
"7",1,7,0,0.1,0.1,0.2,0.3,0.3,0.4,0.5,0.5,0.6,0.7,0.8,0.9,1.1,1.2,1.4,1.7,2,2.5,3.3,5.5,NA,NA,NA
"8",1,8,0,0.1,0.1,0.2,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.6,0.7,0.8,0.9,1,1.2,1.3,1.5,1.7,2,2.3,2.8,4.2
"9",1,9,0,0.1,0.1,0.1,0.2,0.2,0.3,0.4,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2,1.4,1.6,1.9,2.2,2.9,4.2,NA,NA
"10",1,10,0,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.5,0.6,0.8,1,1.3,1.6,2.4,3.6,6,NA,NA,NA,NA,NA,NA,NA
and read.csv(......, header=FALSE) also works.
In general, read.table uses the first five lines of input to determine the
number of fields per line, and in this case these are all shorter than the
8th one. However, the trailing commas _should_ give the right count.
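Anyway, you might try col.names=paste("V", 1:26, sep="") in your
read.table call, which sets the number of columns explicitly instead of
having it inferred from the first lines of the file.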
> Thank you in advance.
>
> Shankar
>
>
>
>
--
O__ ---- Peter Dalgaard Øster Farimagsgade 5, Entr.B
c/ /'_ --- Dept. of Biostatistics PO Box 2099, 1014 Cph. K
(*) \(*) -- University of Copenhagen Denmark Ph: (+45) 35327918
~~~~~~~~~~ - (p.dalgaard at biostat.ku.dk) FAX: (+45) 35327907
More information about the R-help mailing list