[Rd] writeForeignSAS and potential extensions
Stephen Weigand
Weigand.Stephen at mayo.edu
Thu Jul 13 20:48:19 CEST 2006
Dear R-devel,
I've made some potential extensions to writeForeignSAS
in 'foreign' that I wanted to pass along if anyone is
interested. I've attached the diff -u output against
the version found in foreign_0.8-15 and an .R file
with my changes. (In this .R file, the function is named
writeForeignSAS7 to simplify testing/comparisons.)
I've tried to alter the current version as little as
possible while making the following changes:
* Try to convert data.frame names to SAS-legal names and
allow the user to specify an 8- or 32-character limit.
* For factors, try to convert the variable name to a
SAS-legal 8-character name not ending in a digit
* Read in 'datafile' with DSD specified in the INFILE
statement. SAS says this "changes how SAS treats
delimiters when list input is used and sets the default
delimiter to a comma. When you specify DSD, SAS treats
two consecutive delimiters as a missing value and
removes quotation marks from character values." The
point of this is the added safety of using 'quote=TRUE'
when writing 'datafile' via write.table
* Functionality to write out Dates and read them in with
an INFORMAT statement
* Functionality to write out datetime variables
(assuming a class of POSIXct) and read them in with an
INFORMAT statement
* In order to handle character variables a bit better,
use a LENGTH statement to tell SAS the maximum character
width of values in the variable. Without this, some
character values can be truncated.
If it'd be helpful to make any changes or add anything,
I'd be happy try to do so.
Finally, some testing code that works in SAS 6.12, 8.2,
and 9.
d <-
structure(list(a.b = as.integer(c(1, 2)),
alphabetsoup =
structure(as.integer(c(1, 2)),
.Label = c("A", "B"),
class = "factor"),
datevar1 = structure(c(13342, 12977),
class = "Date"),
datetimevar1 = structure(c(1152802685,
1152716285),
class = c("POSIXt", "POSIXct")),
charactervariable = c("L",
"Last, First")),
.Names = c("a.b", "alphabetsoup",
"datevar1", "datetimevar1",
"charactervariable"),
row.names = c("1", "2"),
class = "data.frame")
require(foreign)
### adQuote here to (temporarily) avoid ':::'
adQuote <- function (x) paste("\"", x, "\"", sep = "")
dfile <- file.path(tempdir(), "test.dat")
cfile <- file.path(tempdir(), "test.sas")
write.foreign(d, datafile = dfile, codefile = cfile,
package = "SAS7", validvarname = "V6")
file.show(dfile)
file.show(cfile)
Sincerely,
Stephen
::::::::::::::::::::::::::::::::::
Stephen Weigand
Division of Biostatistics
Mayo Clinic Rochester, Minn., USA
Phone (507) 266-1650, fax 284-9542
-------------- next part --------------
--- writeForeignSAS.R Fri Feb 17 03:30:53 2006
+++ /tmp/writeForeignSAS.R Thu Jul 13 12:24:24 2006
@@ -1,21 +1,52 @@
-writeForeignSAS<-function(df,datafile,codefile,dataname="rdata"){
+make.SAS.names <- function(varnames, validvarname = c("V7", "V6")){
+ validvarname <- match.arg(validvarname)
+ nmax <- if(validvarname == "V7") 32 else 8
+ x <- sub("^([0-9])", "_\\1", varnames)
+ x <- gsub("[^a-zA-Z0-9_]", "_", x)
+ x <- abbreviate(x, minlength = nmax)
+
+ if (any(nchar(x) > nmax) || any(duplicated(x)))
+ stop("Cannot uniquely abbreviate the variable names to ",
+ nmax, " or fewer characters")
+ names(x) <- varnames
+ x
+}
+
+make.SAS.formats <- function(varnames){
+ x <- sub("^([0-9])", "_\\1", varnames)
+ x <- gsub("[^a-zA-Z0-9_]", "_", x)
+ x <- sub("([0-9])$", "\\1f", x) # can't end in digit so append 'f'
+ x <- abbreviate(x, minlength = 8)
+
+ if(any(nchar(x) > 8) || any(duplicated(x)))
+ stop("Cannot uniquely abbreviate format names to conform to ",
+ " eight-character limit and not ending in a digit")
+ names(x) <- varnames
+ x
+}
+
+writeForeignSAS7<-function(df,datafile,codefile,dataname="rdata",
+ validvarname = c("V7", "V6")){
factors <- sapply(df, is.factor)
strings <- sapply(df, is.character)
-
+ dates <- sapply(df, FUN = function(x) inherits(x, "Date"))
+ datetimes <- sapply(df, FUN = function(x) inherits(x, "POSIXct"))
+
varlabels <- names(df)
- varnames <- abbreviate(names(df), 8)
- if (any(sapply(varnames, nchar) > 8))
- stop("Cannot abbreviate the variable names to eight or fewer letters")
- if (any(abbreviated <- (varnames != varlabels)))
- message("Some variable names were abbreviated.")
+ varnames <- make.SAS.names(names(df), validvarname = validvarname)
+ if (any(varnames != varlabels))
+ message("Some variable names were abbreviated or otherwise altered.")
dfn<-df
if (any(factors))
dfn[factors]<-lapply(dfn[factors], as.numeric)
+ if (any(datetimes))
+ dfn[datetimes] <- lapply(dfn[datetimes],
+ FUN = function(x) format(x, "%d%b%Y %H:%M:%S"))
write.table(dfn, file = datafile, row = FALSE, col = FALSE,
- sep = ",", quote = FALSE, na = ".")
+ sep = ",", quote = TRUE, na = "")
lrecl<-max(sapply(readLines(datafile),nchar))+4
cat("* Written by R;\n", file=codefile)
@@ -22,24 +53,50 @@
cat("* ",deparse(sys.call(-2))[1],";\n\n",file=codefile,append=TRUE)
if (any(factors)){
cat("PROC FORMAT;\n",file=codefile,append=TRUE)
- for(v in 1:ncol(df)){
- if (factors[v]){
- cat("value ",varnames[v],"\n",file=codefile,append=TRUE)
- values<-levels(df[[v]])
+ fmtnames <- make.SAS.formats(varnames[factors])
+ fmt.values <- lapply(df[, factors, drop = FALSE], levels)
+ names(fmt.values) <- fmtnames
+ for (f in fmtnames){
+ cat("value",f,"\n",file=codefile,append = TRUE)
+ values<-fmt.values[[f]]
for(i in 1:length(values)){
cat(" ",i,"=",adQuote(values[i]),"\n",file=codefile,append=TRUE)
}
cat(";\n\n",file=codefile,append=TRUE)
- }
- }
+ }
}
cat("DATA ",dataname,";\n",file=codefile,append=TRUE)
+
+ if (any(strings)){
+ cat("LENGTH", file = codefile, append = TRUE)
+ lengths <- sapply(df[,strings, drop = FALSE],
+ FUN = function(x) max(nchar(x)))
+ names(lengths) <- varnames[strings]
+ for(v in varnames[strings])
+ cat("\n", v, "$", lengths[v],file=codefile,append=TRUE)
+ cat("\n;\n\n", file = codefile, append = TRUE)
+ }
+
+ if (any(dates)){
+ cat("INFORMAT", file = codefile, append = TRUE)
+ for(v in varnames[dates])
+ cat("\n", v, file = codefile, append = TRUE)
+ cat("\n YYMMDD10.\n;\n\n", file = codefile, append = TRUE)
+ }
+
+ if (any(datetimes)){
+ cat("INFORMAT", file = codefile, append = TRUE)
+ for(v in varnames[datetimes])
+ cat("\n", v, file = codefile, append = TRUE)
+ cat("\n DATETIME18.\n;\n\n", file = codefile, append = TRUE)
+ }
+
cat("INFILE ",adQuote(datafile),
- "\n DELIMITER=','",
+ "\n DSD",
"\n LRECL=",lrecl,";\n",
file=codefile,append=TRUE)
-
+
cat("INPUT",file=codefile,append=TRUE)
for(v in 1:ncol(df)){
cat("\n",varnames[v],file=codefile,append=TRUE)
@@ -49,16 +106,26 @@
cat("\n;\n",file=codefile,append=TRUE)
for(v in 1:ncol(df)){
- if (abbreviated[v])
+ if (varnames[v] != names(varnames)[v])
cat("LABEL ",varnames[v],"=",adQuote(varlabels[v]),";\n",
file=codefile,append=TRUE)
- }
-
- for(v in 1:ncol(df)){
- if(factors[v])
- cat("FORMAT ",varnames[v],paste(varnames[v],".",sep=""),";\n",
+ }
+
+ if (any(factors)){
+ for (f in 1:length(fmtnames))
+ cat("FORMAT", names(fmtnames)[f],paste(fmtnames[f],".",sep = ""),";\n",
file=codefile,append=TRUE)
}
-
+
+ if (any(dates)){
+ for(v in varnames[dates])
+ cat("FORMAT", v, "yymmdd10.;\n", file = codefile, append = TRUE)
+ }
+
+ if (any(datetimes)){
+ for(v in varnames[datetimes])
+ cat("FORMAT", v, "datetime18.;\n", file = codefile, append = TRUE)
+ }
+
cat("RUN;\n",file=codefile,append=TRUE)
}
-------------- next part --------------
make.SAS.names <- function(varnames, validvarname = c("V7", "V6")){
validvarname <- match.arg(validvarname)
nmax <- if(validvarname == "V7") 32 else 8
x <- sub("^([0-9])", "_\\1", varnames)
x <- gsub("[^a-zA-Z0-9_]", "_", x)
x <- abbreviate(x, minlength = nmax)
if (any(nchar(x) > nmax) || any(duplicated(x)))
stop("Cannot uniquely abbreviate the variable names to ",
nmax, " or fewer characters")
names(x) <- varnames
x
}
make.SAS.formats <- function(varnames){
x <- sub("^([0-9])", "_\\1", varnames)
x <- gsub("[^a-zA-Z0-9_]", "_", x)
x <- sub("([0-9])$", "\\1f", x) # can't end in digit so append 'f'
x <- abbreviate(x, minlength = 8)
if(any(nchar(x) > 8) || any(duplicated(x)))
stop("Cannot uniquely abbreviate format names to conform to ",
" eight-character limit and not ending in a digit")
names(x) <- varnames
x
}
writeForeignSAS7<-function(df,datafile,codefile,dataname="rdata",
validvarname = c("V7", "V6")){
factors <- sapply(df, is.factor)
strings <- sapply(df, is.character)
dates <- sapply(df, FUN = function(x) inherits(x, "Date"))
datetimes <- sapply(df, FUN = function(x) inherits(x, "POSIXct"))
varlabels <- names(df)
varnames <- make.SAS.names(names(df), validvarname = validvarname)
if (any(varnames != varlabels))
message("Some variable names were abbreviated or otherwise altered.")
dfn<-df
if (any(factors))
dfn[factors]<-lapply(dfn[factors], as.numeric)
if (any(datetimes))
dfn[datetimes] <- lapply(dfn[datetimes],
FUN = function(x) format(x, "%d%b%Y %H:%M:%S"))
write.table(dfn, file = datafile, row = FALSE, col = FALSE,
sep = ",", quote = TRUE, na = "")
lrecl<-max(sapply(readLines(datafile),nchar))+4
cat("* Written by R;\n", file=codefile)
cat("* ",deparse(sys.call(-2))[1],";\n\n",file=codefile,append=TRUE)
if (any(factors)){
cat("PROC FORMAT;\n",file=codefile,append=TRUE)
fmtnames <- make.SAS.formats(varnames[factors])
fmt.values <- lapply(df[, factors, drop = FALSE], levels)
names(fmt.values) <- fmtnames
for (f in fmtnames){
cat("value",f,"\n",file=codefile,append = TRUE)
values<-fmt.values[[f]]
for(i in 1:length(values)){
cat(" ",i,"=",adQuote(values[i]),"\n",file=codefile,append=TRUE)
}
cat(";\n\n",file=codefile,append=TRUE)
}
}
cat("DATA ",dataname,";\n",file=codefile,append=TRUE)
if (any(strings)){
cat("LENGTH", file = codefile, append = TRUE)
lengths <- sapply(df[,strings, drop = FALSE],
FUN = function(x) max(nchar(x)))
names(lengths) <- varnames[strings]
for(v in varnames[strings])
cat("\n", v, "$", lengths[v],file=codefile,append=TRUE)
cat("\n;\n\n", file = codefile, append = TRUE)
}
if (any(dates)){
cat("INFORMAT", file = codefile, append = TRUE)
for(v in varnames[dates])
cat("\n", v, file = codefile, append = TRUE)
cat("\n YYMMDD10.\n;\n\n", file = codefile, append = TRUE)
}
if (any(datetimes)){
cat("INFORMAT", file = codefile, append = TRUE)
for(v in varnames[datetimes])
cat("\n", v, file = codefile, append = TRUE)
cat("\n DATETIME18.\n;\n\n", file = codefile, append = TRUE)
}
cat("INFILE ",adQuote(datafile),
"\n DSD",
"\n LRECL=",lrecl,";\n",
file=codefile,append=TRUE)
cat("INPUT",file=codefile,append=TRUE)
for(v in 1:ncol(df)){
cat("\n",varnames[v],file=codefile,append=TRUE)
if(strings[v])
cat(" $ ",file=codefile,append=TRUE)
}
cat("\n;\n",file=codefile,append=TRUE)
for(v in 1:ncol(df)){
if (varnames[v] != names(varnames)[v])
cat("LABEL ",varnames[v],"=",adQuote(varlabels[v]),";\n",
file=codefile,append=TRUE)
}
if (any(factors)){
for (f in 1:length(fmtnames))
cat("FORMAT", names(fmtnames)[f],paste(fmtnames[f],".",sep = ""),";\n",
file=codefile,append=TRUE)
}
if (any(dates)){
for(v in varnames[dates])
cat("FORMAT", v, "yymmdd10.;\n", file = codefile, append = TRUE)
}
if (any(datetimes)){
for(v in varnames[datetimes])
cat("FORMAT", v, "datetime18.;\n", file = codefile, append = TRUE)
}
cat("RUN;\n",file=codefile,append=TRUE)
}
More information about the R-devel
mailing list