[R] What is the fastest way to see what are in an RData file?
Gustaf Rydevik
gustaf.rydevik at gmail.com
Thu Dec 17 12:33:29 CET 2009
On Wed, Dec 16, 2009 at 10:13 PM, Peng Yu <pengyu.ut at gmail.com> wrote:
>
> Currently, I load the RData file then ls() and str(). But loading the file
> takes too long if the file is big. Most of the time, I only interested what
> the variables are in the the file and the attributes of the variables (like
> if it is a data.frame, matrix, what are the colnames/rownames, etc.)
>
> I'm wondering if there is any facility in R to help me avoid loading the
> whole file.
I thought this was interesting as well, so i did a bit of searching
through the R-help list archives and found this answer by Simon
Urbanek:
https://stat.ethz.ch/pipermail/r-devel/2007-August/046724.html
The link to a c-routine that does what you want still works, but for
future reference I'm pasting the code below.
Regards,
Gustaf
----------------------------
/* rdcopy v0.1-0 - extract objects or display contents of RData RDX2 files
*
* Copyright (C) 2007 Simon Urbanek
* based in part on src/main/serialize.c and src/main/saveload.c from R:
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
* Copyright (C) 1997--2007 Robert Gentleman, Ross Ihaka and the
* R Development Core Team
* License: GPL v2
*
* Although R includes are needed to compile this (for constants),
* libR does NOT have to be linked.
*/
#include <stdio.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <R.h>
#include <Rinternals.h>
#ifndef _
#define _(X) X
#endif
#undef error
void error(char *fmt, ...) {
va_list(ap);
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
exit(1);
}
/* .RData:
byte 0..4 XDR2. - file magic ("XDR2\n"=XDR ver2)
byte 5..6 X. - format ("A\n"=ASCII, "B\n"=binary, "X\n"=XDR)
byte 7... RXDR2 stream.
Note: RXDR2 format in NOT a valid XDR format! Strings and
raw bytes are not padded and thus cannot be read
using XDR alone.
*/
/* we need to override this so that we don't have to really use libR */
SEXP R_NilValue = 0;
/* those are directly from serialize.c */
#define REFSXP 255
#define NILVALUE_SXP 254
#define GLOBALENV_SXP 253
#define UNBOUNDVALUE_SXP 252
#define MISSINGARG_SXP 251
#define BASENAMESPACE_SXP 250
#define NAMESPACESXP 249
#define PACKAGESXP 248
#define PERSISTSXP 247
#define CLASSREFSXP 246
#define GENERICREFSXP 245
#define BCREPDEF 244
#define BCREPREF 243
#define EMPTYENV_SXP 242
#define BASEENV_SXP 241
/* map type to a name */
static const char *nameSEXP(int type) {
switch (type) {
case REFSXP: return "REF";
case NILVALUE_SXP: return "NULL";
case GLOBALENV_SXP: return ".GlobalEnv";
case UNBOUNDVALUE_SXP: return "<unbound>";
case MISSINGARG_SXP: return "<missing>";
case BASENAMESPACE_SXP: return "<<base>>";
case NAMESPACESXP: return "NAMESPACE";
case PACKAGESXP: return "PACKAGE";
case PERSISTSXP: return "PERSIST";
case CLASSREFSXP: return "CLASSREF";
case GENERICREFSXP: return "GENERICREF";
case BCREPDEF: return "BC-REP-DEF";
case BCREPREF: return "BC-REP-REF";
case EMPTYENV_SXP: return "<empty-env>";
case BASEENV_SXP: return "<base-env>";
case NILSXP: return "NIL";
case SYMSXP: return "SYM";
case LISTSXP: return "LIST";
case CLOSXP: return "CLO";
case ENVSXP: return "ENV";
case PROMSXP: return "PROM";
case LANGSXP: return "LANG";
case SPECIALSXP: return "SPECIAL";
case BUILTINSXP: return "BUILTIN";
case CHARSXP: return "CHAR";
case LGLSXP: return "LGL";
case INTSXP: return "INT";
case REALSXP: return "REAL";
case CPLXSXP: return "CPLX";
case STRSXP: return "STR";
case DOTSXP: return "...";
case ANYSXP: return "ANY";
case VECSXP: return "VEC";
case EXPRSXP: return "EXPR";
case BCODESXP: return "BCODE";
case EXTPTRSXP: return "EXTPTR";
case WEAKREFSXP: return "WEAKREF";
case RAWSXP: return "RAW";
case S4SXP: return "S4";
}
return "?";
}
/* again from serialize.c */
#define IS_OBJECT_BIT_MASK (1 << 8)
#define HAS_ATTR_BIT_MASK (1 << 9)
#define HAS_TAG_BIT_MASK (1 << 10)
#define ENCODE_LEVELS(v) (v << 12)
#define DECODE_LEVELS(v) (v >> 12)
#define DECODE_TYPE(v) (v & 255)
/* this structure is passed acros all functions. it encapsulates both
the reading an book-keeping */
typedef struct {
XDR xdrs;
char *buf;
long bs;
FILE *f;
int lev;
char *flag;
int refs;
long *ref; /* reference offsets */
int maxrefs; /* length of the refes vector */
int verb;
int mode;
int flags;
long target;
FILE *copyf;
} SaveLoadData;
#define M_Read 0
#define M_NonRefCopy 1
#define M_Copy 2
#define M_NonRefSelect 3
#define F_NOREF 1
/* the following is partially based on src/main/saveload.c from R */
static void XdrInInit(FILE *fp, SaveLoadData *d, long sbsize)
{
xdrstdio_create(&d->xdrs, fp, XDR_DECODE);
d->buf = (char*) malloc(sbsize);
if (!(d->buf))
error(_("cannot allocate memory for a string buffer"));
d->bs = sbsize;
d->f = fp;
d->lev = 0;
d->flag = 0;
d->flags = 0;
d->refs = 0;
d->maxrefs = 2048;
d->ref = (long*) malloc(sizeof(long)*d->maxrefs);
d->copyf = 0;
d->mode = M_Read;
}
static void XdrInTerm(SaveLoadData *d)
{
xdr_destroy(&d->xdrs);
free(d->buf);
if (d->f) fclose(d->f);
if (d->copyf) fclose(d->copyf);
}
static void XdrSkipBytes(SaveLoadData *d, int n) {
while (n > d->bs) {
XdrSkipBytes(d, d->bs);
n-=d->bs;
}
fread(d->buf, 1, n, d->f);
if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy))
fwrite(d->buf, 1, n, d->copyf);
/* fseek(d->f, n, SEEK_CUR); */
}
static int XdrInInteger(SaveLoadData *d)
{
int i=0;
if (!xdr_int(&d->xdrs, &i)) {
xdr_destroy(&d->xdrs);
error(_("a I read error occurred"));
}
if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy)) {
unsigned int y = (unsigned int) i;
unsigned char ib[4];
ib[0]=y>>24; ib[1]=(y>>16)&255; ib[2]=(y>>8)&255; ib[3]=y&255;
fwrite(ib, 1, 4, d->copyf);
}
return i;
}
static double XdrInReal(SaveLoadData *d)
{
double x;
if (!xdr_double(&d->xdrs, &x)) {
xdr_destroy(&d->xdrs);
error(_("a R read error occurred"));
}
if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy)) {
unsigned long long y = *((unsigned long long*) &x);
unsigned char ib[8];
ib[0]=y>>56; ib[1]=(y>>48)&255; ib[2]=(y>>40)&255; ib[3]=(y>>32)&255;
ib[4]=(y>>24)&255; ib[5]=(y>>16)&255; ib[6]=(y>>8)&255; ib[7]=y&255;
fwrite(ib, 1, 8, d->copyf);
}
return x;
}
static Rcomplex XdrInComplex(SaveLoadData *d)
{
Rcomplex x;
if (!xdr_double(&d->xdrs, &(x.r)) || !xdr_double(&d->xdrs, &(x.i))) {
xdr_destroy(&d->xdrs);
error(_("a CR read error occurred"));
}
if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy)) {
unsigned long long y = *((unsigned long long*) &x.r);
unsigned long long v = *((unsigned long long*) &x.i);
unsigned char ib[16];
ib[0]=y>>56; ib[1]=(y>>48)&255; ib[2]=(y>>40)&255; ib[3]=(y>>32)&255;
ib[4]=(y>>24)&255; ib[5]=(y>>16)&255; ib[6]=(y>>8)&255; ib[7]=y&255;
ib[8]=v>>56; ib[9]=(v>>48)&255; ib[10]=(v>>40)&255; ib[11]=(v>>32)&255;
ib[12]=(v>>24)&255; ib[13]=(v>>16)&255; ib[14]=(v>>8)&255; ib[15]=v&255;
fwrite(ib, 1, 16, d->copyf);
}
return x;
}
static char *XdrInBytes(SaveLoadData *d, char *buf, unsigned int len) {
if (!buf) {
XdrSkipBytes(d, len);
return d->buf;
}
fread(buf, 1, len, d->f);
if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy))
fwrite(buf, 1, len, d->copyf);
return buf;
}
static char *XdrInString(SaveLoadData *d)
{
if (!xdr_string(&d->xdrs, &d->buf, d->bs)) {
xdr_destroy(&d->xdrs);
error(_("a S read error occurred"));
}
return d->buf;
}
/* back to serialize.c */
#define UNPACK_REF_INDEX(i) ((i) >> 8)
static SEXP ReadItem(SaveLoadData *d);
static SEXP InStringVec(SaveLoadData *d) {
int per = XdrInInteger(d);
int len = XdrInInteger(d);
int i = 0;
while (i < len) {
ReadItem(d);
i++;
}
return R_NilValue;
}
static void AddReadRef(SaveLoadData *d, long off) {
if (d->flags & F_NOREF) return;
d->ref[d->refs++] = off;
if (d->verb) printf(" {ref=%d}", d->refs);
if (d->refs>=d->maxrefs) {
XdrInTerm(d);
error(_("too many references in the data file"));
}
}
#define InVec(fp, obj, accessor, infunc, length) \
{ \
int cnt; \
for (cnt = 0; cnt < length; ++cnt) \
/*accessor(obj, cnt,*/ infunc(fp); \
}
static SEXP ReadBCLang(SaveLoadData *d, int type) {
switch (type) {
case BCREPREF:
XdrInInteger(d);
return R_NilValue;
case BCREPDEF:
case LANGSXP:
case LISTSXP:
{
int pos = -1;
if (type == BCREPDEF) {
pos = XdrInInteger(d);
type = XdrInInteger(d);
}
/*TAG*/ ReadItem(d);
/*CAR*/ ReadBCLang(d, XdrInInteger(d));
/*CDR*/ ReadBCLang(d, XdrInInteger(d));
return R_NilValue;
}
default: return ReadItem(d);
}
}
static SEXP ReadBC(SaveLoadData *d) {
ReadItem(d); /* code */
{ /* consts */
int blen = XdrInInteger(d);
int bc = 0;
while (bc < blen) {
int type = XdrInInteger(d);
switch (type) {
case BCODESXP:
ReadBC(d);
break;
case LANGSXP:
case LISTSXP:
case BCREPDEF:
case BCREPREF:
ReadBCLang(d, type);
break;
default:
ReadItem(d);
}
bc++;
}
}
}
static SEXP ReadItem_(SaveLoadData *d, long boe, int cut);
static SEXP ReadItem(SaveLoadData *d) {
long boe = ftell(d->f);
if (d->mode==M_NonRefSelect && boe==d->target) {
printf(" -> saving object at %ld\n", boe);
d->mode=M_NonRefCopy;
ReadItem_(d, boe, 1);
d->mode=M_Read;
} else ReadItem_(d, boe, 0);
return 0;
}
static SEXP ReadItem_(SaveLoadData *d, long boe, int cut) {
int flags = XdrInInteger(d);
int type = DECODE_TYPE(flags);
int lev = DECODE_LEVELS(flags);
int hasattr = flags & HAS_ATTR_BIT_MASK ? 1 : 0;
int hastag = flags & HAS_TAG_BIT_MASK ? 1 : 0;
int isobj = flags & IS_OBJECT_BIT_MASK ? 1 : 0;
SEXP s = R_NilValue;
int len;
int isroot = 0;
char px[64], *cpx=px+d->lev; *cpx=0; while (--cpx>=px) *cpx=' ';
if (!d->flag) d->flag="";
if (type!=CHARSXP && d->verb)
printf("\n@%-7ld%s%s %s %08x [type=%d%s%s%s]", boe, px, d->flag,
nameSEXP(type), flags, type, hasattr?",ATTR":"", hastag?",TAG":"",
isobj?",OBJ":"");
d->flag="";
switch(type) {
case NILVALUE_SXP: return 0/* R_NilValue */;
case EMPTYENV_SXP: return 0/*R_EmptyEnv*/;
case BASEENV_SXP: return 0/*R_BaseEnv*/;
case GLOBALENV_SXP: return 0/*R_GlobalEnv*/;
case UNBOUNDVALUE_SXP: return 0/*R_UnboundValue*/;
case MISSINGARG_SXP: return 0/*R_MissingArg*/;
case BASENAMESPACE_SXP:
return 0/*R_BaseNamespace*/;
case REFSXP:
{
int refi = UNPACK_REF_INDEX(flags);
if (!refi) refi = XdrInInteger(d);
if (d->verb) printf("<REFSXP: %d>", refi);
if (d->mode==M_NonRefCopy) {
long cp = ftell(d->f);
long cop = ftell(d->copyf);
long back = -4;
SaveLoadData e;
e.verb=0;
if (!UNPACK_REF_INDEX(flags)) back -= 4;
if (refi<1 || refi>d->refs) {
XdrInTerm(d);
error(_("invalid reference %d"), refi);
}
if (fseek(d->f, d->ref[refi-1], SEEK_SET)) {
XdrInTerm(d);
error(_("unable to seek to reference %d"), refi);
}
if (fseek(d->copyf, back, SEEK_CUR)) { /* backup to overwise the
reference */
XdrInTerm(d);
error(_("unable to seek in the output stream"));
}
XdrInInit(d->f, &e, d->bs);
e.flags=F_NOREF; e.copyf=d->copyf; e.mode=d->mode;
ReadItem(&e);
e.copyf=0; e.f=0; /* we need to delete those to Term doesn't close them */
XdrInTerm(&e);
if (fseek(d->f, cp, SEEK_SET)) {
XdrInTerm(d);
error(_("unable to return to reference point"));
}
}
return R_NilValue;
}
case PERSISTSXP:
InStringVec(d);
AddReadRef(d, boe);
return s;
case SYMSXP:
d->lev++;
ReadItem(d); /* print name */
AddReadRef(d, boe);
d->lev--;
return s;
case PACKAGESXP:
InStringVec(d);
AddReadRef(d, boe);
return s;
case NAMESPACESXP:
InStringVec(d);
AddReadRef(d, boe);
return s;
case ENVSXP:
{
int locked = XdrInInteger(d);
AddReadRef(d, boe);
d->lev++;
/*ENCLOS*/ ReadItem(d);
/*FRAME*/ ReadItem(d);
/*TAG*/ ReadItem(d);
/*ATTR*/ ReadItem(d);
/* We don't write out the object bit for environments, so
reconstruct it here if needed. */
/* Convert a NULL enclosure to baseenv()
if (ENCLOS(s) == R_NilValue) SET_ENCLOS(s, R_BaseEnv); */
d->lev--;
return s;
}
case LISTSXP:
if (d->lev==0) isroot=1;
case LANGSXP:
case CLOSXP:
case PROMSXP:
case DOTSXP:
d->lev++;
if (hasattr) { d->flag="ATT"; ReadItem(d); }
if (hastag) { d->flag="TAG"; ReadItem(d);
if (isroot) printf(d->verb?"\n%s\t%ld":"%s\t%ld\n", d->buf, boe);
};
/*CAR*/ d->flag="CAR"; ReadItem(d);
if (cut) { /* if this is the selected object, then we cannot
proceed to CDR but close it instead */
unsigned char ib[4] = { 0, 0, 0, NILVALUE_SXP };
fwrite(ib, 1, 4, d->copyf);
d->lev--;
return 0;
}
/*CDR*/ d->flag="CDR"; if (isroot) d->lev=0; ReadItem(d);
/* For reading closures and promises stored in earlier versions,
convert NULL env to baseenv()
if (type == CLOSXP && CLOENV(s) == R_NilValue) SET_CLOENV(s,
R_BaseEnv);
else if (type == PROMSXP && PRENV(s) == R_NilValue) SET_PRENV(s,
R_BaseEnv); */
if (d->lev>0) d->lev--;
isroot=0;
return s;
default:
/* These break out of the switch to have their ATTR, LEVELS, and
OBJECT fields filled in. Each leaves the newly allocated value
PROTECTed */
switch (type) {
case EXTPTRSXP:
d->lev++;
AddReadRef(d, boe);
/*PtrProtected*/ ReadItem(d);
/*PtrTag*/ ReadItem(d);
d->lev--;
break;
case WEAKREFSXP:
AddReadRef(d, boe);
break;
case SPECIALSXP:
case BUILTINSXP:
len = XdrInInteger(d);
XdrInBytes(d, 0, len);
break;
case CHARSXP:
len = XdrInInteger(d);
if (len == -1)
s = 0 /*NA_STRING*/;
else {
char *c = XdrInBytes(d, 0, len);
c[len]=0;
if (d->verb>1) printf(" '%s'", c);
}
break;
case LGLSXP:
len = XdrInInteger(d);
InVec(d, s, SET_LOGICAL_ELT, XdrInInteger, len);
break;
case INTSXP:
len = XdrInInteger(d);
InVec(d, s, SET_INTEGER_ELT, XdrInInteger, len);
break;
case REALSXP:
len = XdrInInteger(d);
InVec(d, s, SET_REAL_ELT, XdrInReal, len);
break;
case CPLXSXP:
len = XdrInInteger(d);
InVec(d, s, SET_COMPLEX_ELT, XdrInComplex, len);
break;
case STRSXP:
{
int count = 0;
len = XdrInInteger(d);
d->lev++;
for (; count < len; ++count)
ReadItem(d);
d->lev--;
}
break;
case VECSXP:
case EXPRSXP:
{
int count = 0;
len = XdrInInteger(d);
d->lev++;
for (; count < len; ++count)
ReadItem(d);
d->lev--;
}
break;
case BCODESXP:
{
int count = 0;
len = XdrInInteger(d);
while (count < len) {
ReadBC(d);
count++;
}
}
break;
case CLASSREFSXP:
error(_("this version of R cannot read class references"));
case GENERICREFSXP:
error(_("this version of R cannot read generic function references"));
case RAWSXP:
len = XdrInInteger(d);
XdrSkipBytes(d, len);
break;
case S4SXP:
break;
default:
s = R_NilValue; /* keep compiler happy */
error(_("ReadItem: unknown type %i, perhaps written by later
version of R"), type);
}
d->lev++;
if (hasattr) ReadItem(d);
d->lev--;
return s;
}
}
int main(int ac, char **av) {
char sig[16];
int ver, wri, rel;
FILE *f, *of = 0;
SaveLoadData sal, *d = &sal;
if (ac<2) {
printf("\n Usage: rdcopy <source> [-v | <target> <offset>]\n\n
Extracts an object from a RData file.\n Use rdlist to obtain all valid
offsets for each object.\n\n");
return 1;
}
f = fopen(av[1], "rb");
sal.verb = 0;
if (!f)
error(_("unable to open file %s"), av[1]);
if (fread(sig, 1, 7, f)!=7) {
fclose(f);
error(_("unable to read magic number"));
}
sig[7]=0;
if (!strcmp(sig, "XDR2\nX\n")) {
{ char *c=sig; while(*c) { if (*c<' ') *c='.'; c++; } }
printf("Format: '%s'\n", sig);
fclose(f);
error(_("XDR v2 is the only supported format"));
}
if (ac>2) {
if (!strcmp(av[2],"-v")) {
d->verb=2;
} else {
of = fopen(av[2], "wb");
if (!of) {
fclose(f);
error(_("unable to create %s"), av[2]);
}
fwrite(sig, 1, 7, of);
}
}
XdrInInit(f, d, 64*1024);
d->mode=of?M_NonRefCopy:M_Read;
d->copyf=of;
d->target=ac>3?atol(av[3]):0;
ver=XdrInInteger(d);
wri=XdrInInteger(d);
rel=XdrInInteger(d);
printf("Format version %x, R version = %d.%d.%d, release = %x\n",
ver, wri>>16, (wri>>8)&255, wri&255, rel);
if (ver != 2) {
XdrInTerm(d);
error(_("Sorry, this tool supported RXDR version 2 format only\n"));
}
if (of) d->mode=M_NonRefSelect;
ReadItem(d);
XdrInTerm(d);
if (d->mode!=M_Read)
printf("\nNo object selected. Please use above offsets to select
an object.\n");
return 0;
}
More information about the R-help
mailing list