[R] Help with Data Transformation

Tue Jan 11 01:28:56 CET 2011

That's fine.  Am I correct that this is the format you want for the output file? 

nams <- c("fldsampid", "CLP_ID", "sacode", "matrix", "etc.", "CL",
"PO4", "SO4AG", "AL", "AS", "B", "V", "Zn", "etc.", "ALK",
"ALKB", "ALKC", "SOLID", "DOC", "TOC", "NO3")

It seems a bit suspicious as you have etc. in there twice.

In any case if I understand what you want all you need to do is arrange those names in the order you want and put them in a vector call it bb.

Then you simply say

newxx <- xx[,bb]
et voilàI

You may want to have a look at indexing in the Introduction to R to get a feeling for what's happening herel

Below is a small example.

However I don't think your data.frame is quite what you think it is.

When I do str(xx) to look at the structure all your variables are being read in as factors, which I suspect is not what you want.  R tries to recognize what type of variable is being read in and often seems to decide a character or even a numeric variable is a factor

You may want to run the command 
options(stringsAsFactors = FALSE)
before you load the data into the data.frame

I hope this is of some help.
#===================================================================
df1 <- structure(list(site = c(1, 1, 4, 4, 1, 4), id = structure(c(1L,
2L, 2L, 3L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"),
    cata = c(1, 1, 6, 1, 1, NA), catb = c(1, 2, 3, 4, 5, 6),
    doga = c(3, 5, 3, 6, 4, 0), dogb = c(2, 4, 6, 8, 10, 12),
    rata = c(NA, 9, 9, 8, 9, 8), ratb = c(1, 2, 3, 4, 5, 6),
    bata = c(12, 42, NA, 45, 32, 54), batb = c(13, 15, 17, 19,
    21, 23)), .Names = c("site", "id", "cata", "catb", "doga",
"dogb", "rata", "ratb", "bata", "batb"), row.names = c("aa",
"bb", "cc", "dd", "ee", "ff"), class = "data.frame")

df1

bb <- c("dogb", "rata", "ratb", "bata", "batb", "site", "id", "cata",
    "catb", "doga")

    newdf <- df1[,bb]

#==================================================================

--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote:

> From: Guy Jett <GJett at itsi.com>
> Subject: RE: [R] Help with Data Transformation
> To: "John Kane" <jrkrideau at yahoo.ca>, "r-help at r-project.org" <r-help at r-project.org>
> Received: Monday, January 10, 2011, 5:20 PM
> Thank you John,
> I have put a copy of the dput file below my signature
> block.  I hope that is what you need as I am unfamiliar
> with that function.  Note that empty cells need to
> become "NA".
> The "=" character is part of the "PARVALUE" column.
> Yours,
> Guy
> gjett at itsi.com
> 
> structure(list(X = c(2268L, 2269L, 2270L, 2272L, 2273L,
> 2274L, 
> 2275L, 2276L, 2289L, 2290L, 2291L, 2292L, 2293L, 2294L,
> 2295L, 
> 2296L, 2326L, 2327L, 2328L, 2329L, 2330L, 2331L, 2332L,
> 2346L, 
> 2349L, NA, NA, NA, NA, NA), fldsampid = structure(c(3L, 3L,
> 3L, 
> 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
> 4L, 
> 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 2L, 3L, 4L), .Label = c("",
> "fldsampid", 
> "LHR020GW-01E2", "LHR020SD-00E2"), class = "factor"),
> CLP_ID = structure(c(1L, 
> 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 
> 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 3L), .Label
> = c("", 
> "CLP_ID", "MY77J8"), class = "factor"), sacode =
> structure(c(2L, 
> 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
> 2L, 
> 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 2L), .Label
> = c("", 
> "N", "sacode"), class = "factor"), matrix = structure(c(4L,
> 4L, 
> 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L,
> 3L, 
> 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 4L, 3L), .Label =
> c("", 
> "matrix", "SE", "WG"), class = "factor"), etc. =
> structure(c(1L, 
> 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 
> 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label
> = c("", 
> "etc."), class = "factor"), prccode = structure(c(4L, 4L,
> 4L, 
> 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 8L, 5L, 6L,
> 7L, 
> 5L, 5L, 5L, 5L, 5L, 6L, 1L, 1L, 3L, 2L, NA), .Label = c("",
> "<value>", 
> "CL", "INO", "MET", "MI", "ORG", "SN"), class = "factor"),
> Lab = structure(c(4L, 
> 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
> 4L, 
> 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 5L, 2L, NA), .Label
> = c("", 
> "<value>", "A4SW", "BRLS", "PO4"), class = "factor"),
> EXMCODE = structure(c(5L, 
> 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 5L, 5L,
> 4L, 
> 4L, 5L, 4L, 4L, 4L, 4L, 7L, 4L, 1L, 1L, 6L, 2L, NA), .Label
> = c("", 
> "<value>", "FLDFLT", "METHOD", "NONE", "SO4",
> "SW3050B"), class = "factor"), 
>     Analysis = structure(c(13L, 13L, 13L, 10L,
> 11L, 11L, 11L, 
>     11L, 11L, 11L, 11L, 3L, 3L, 3L, 5L, 13L, 9L,
> 8L, 15L, 7L, 
>     12L, 12L, 12L, 14L, 4L, 1L, 1L, 6L, 2L, 2L),
> .Label = c("", 
>     "<value>", "A2320", "A2540G", "A5310B",
> "AG", "C245.5", "E160.3", 
>     "E1630", "E1631", "E1638", "E200.8", "E300",
> "SW6010B", "SW9060"
>     ), class = "factor"), PARLABEL =
> structure(c(10L, 16L, 17L, 
>     12L, 3L, 4L, 8L, 9L, 20L, 21L, 15L, 5L, 6L,
> 7L, 11L, 14L, 
>     13L, 18L, 19L, 12L, 3L, 8L, 9L, 20L, 18L, 1L,
> 1L, 4L, 2L, 
>     2L), .Label = c("", "<value>", "AG",
> "AL", "ALK", "ALKB", 
>     "ALKC", "AS", "B", "CL", "DOC", "HG", "MEHG",
> "NO3", "PB", 
>     "PO4", "SO4", "SOLID", "TOC", "V", "Zn"),
> class = "factor"), 
>     PARVQ = structure(c(3L, 5L, 3L, 3L, 3L, 3L,
> 3L, 3L, 3L, 3L, 
>     5L, 3L, 3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 5L,
> 3L, 3L, 3L, 3L, 
>     1L, 1L, 4L, 2L, 2L), .Label = c("",
> "<value>", "=", "AS", 
>     "ND"), class = "factor"), Result =
> structure(c(12L, 20L, 
>     11L, 3L, 10L, 8L, 14L, 26L, 9L, 5L, 4L, 25L,
> 25L, 13L, 19L, 
>     24L, 15L, 18L, 16L, 21L, 6L, 22L, 7L, 23L,
> 17L, 1L, 1L, 27L, 
>     2L, 2L), .Label = c("", "<value>",
> "0.00171", "0.008", "1.76", 
>     "1050", "11400", "122", "131", "2.57",
> "22460", "23590.9", 
>     "2500", "317", "4.28", "4.823", "47.7",
> "48.45", "49330", 
>     "50", "5100", "5500", "56900", "792",
> "807000", "9970", "B"
>     ), class = "factor"), X.1 = structure(c(1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L),
> .Label = c("", "<value>", 
>     "V"), class = "factor"), X.2 =
> structure(c(1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA),
> .Label = c("", 
>     "<value>", "Zn"), class = "factor"),
> X.3 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, 2L), .Label = c("", 
>     "<value>", "etc."), class = "factor"),
> X.4 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, NA), .Label = c("", 
>     "<value>", "ALK"), class = "factor"),
> X.5 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, NA), .Label = c("", 
>     "<value>", "ALKB"), class = "factor"),
> X.6 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, NA), .Label = c("", 
>     "<value>", "ALKC"), class = "factor"),
> X.7 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, 2L), .Label = c("", 
>     "<value>", "SOLID"), class = "factor"),
> X.8 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, NA), .Label = c("", 
>     "<value>", "DOC"), class = "factor"),
> X.9 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, NA), .Label = c("", 
>     "<value>", "TOC"), class = "factor"),
> X.10 = structure(c(1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 1L, 1L, 1L, 1L, 
>     1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
> 3L, 2L, NA), .Label = c("", 
>     "<value>", "NO3"), class = "factor")),
> .Names = c("X", "fldsampid", 
> "CLP_ID", "sacode", "matrix", "etc.", "prccode", "Lab",
> "EXMCODE", 
> "Analysis", "PARLABEL", "PARVQ", "Result", "X.1", "X.2",
> "X.3", 
> "X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10"), class =
> "data.frame", row.names = c(NA, 
> -30L))
> 
> 
> -----Original Message-----
> From: John Kane [mailto:jrkrideau at yahoo.ca]
> 
> Sent: Monday, January 10, 2011 1:43 PM
> To: r-help at r-project.org;
> Guy Jett
> Subject: Re: [R] Help with Data Transformation
> 
> That sample data set is really hard to read.  Could
> you resent it after having used dput on it?  
> 
> A file output with dput is easily read into R and makes
> seeing what you need much easier.  BTW what are the =
> doing?
> 
> Thanks
> 
> 
> 
> --- On Mon, 1/10/11, Guy Jett <GJett at itsi.com>
> wrote:
> 
> > From: Guy Jett <GJett at itsi.com>
> > Subject: [R] Help with Data Transformation
> > To: "r-help at r-project.org"
> <r-help at r-project.org>
> > Received: Monday, January 10, 2011, 3:59 PM Greetings,
> I am new to R 
> > and am having trouble with parsing a file with the
> following 
> > characteristics:
> > 
> > *         Individual results
> > for a single sample are written to multiple lines.
> > 
> > *         First 16 columns
> > are constant from sample to sample.
> > 
> > *         Remaining 10 need
> > to be matched up (cross-tabbed?)
> > 
> > o   (the exact contents for the remaining 10 vary
> from sample to 
> > sample, as indicated in the extract
> > below)
> > 
> > *         Ultimate goal is to
> > run various comparisons between the variable columns,
> compare samples 
> > from separate populations, and graph samples from the
> separate 
> > populations.
> > 
> > *         (An extract is
> > provided below)
> > 
> > The data is initially extracted from an SQL database
> into Excel, then 
> > saved as a tab-delimited text file for use in R.
> > I have been successful in using subset() to extract
> specific sample 
> > types, but have not yet been able to transform the
> data so that all 
> > the data needed is on a single line.  I have looked
> at several R 
> > manuals, read through 'R in a Nutshell', prowled the
> help resources (R 
> > Site Search and the Google link), tried stack(),
> subset(), reshape(), 
> > and several other functions, to no avail.
> > 
> > Thank you very much for your help.  This seems like a
> wonderful 
> > community, Guy Jett, R.G.
> > Project Geologist
> > gjett at itsi.com<mailto:gjett at itsi.com>
> > 
> > Example Data Input (subset):
> >                
> > fldsampid
> > CLP_ID  sacode
> > matrix   etc...
> > prccode
> >   Lab
> >    EXMCODE
> >    Analysis
> >       PARLABEL
> >         PARVQ Result
> > 2268       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       INO
> > BRLS      NONE    E300
> >    CL
> >    =
> >    23590.9
> > 2269       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       INO
> > BRLS      NONE    E300
> >    PO4        ND
> >         50
> > 2270       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       INO
> > BRLS      NONE    E300
> >    SO4        =
> >            22460
> > 2272       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1631    HG
> > =
> >    0.00171
> > 2273       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1638    AG
> > =             2.57
> > 2274       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1638    AL
> >    =
> >    122
> > 2275       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1638    AS
> >    =
> >    317
> > 2276       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1638    B
> >    =
> >    9970
> > 2289       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1638    V
> >    =
> >    131
> > 2290       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      FLDFLT
> > E1638    Zn
> >    =
> >    1.76
> > 2291       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MET
> >    BRLS      METHOD
> >             E1638
> > PB           ND
> >               0.008
> > 2292       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MI
> > BRLS      NONE    A2320
> >   ALK        =
> >        807000
> > 2293       LHR020GW-01E2
> >                
> >              N
> >            WG
> >                
> >       MI
> > BRLS      NONE    A2320
> >   ALKB      =
> >      807000
> > 2294       LHR020GW-01E2
> >                
> >              N 
> >            WG 
> >                
> >       MI         
> > BRLS      NONE    A2320 
> >   ALKC      ND     
> >     2500
> > 2295       LHR020GW-01E2 
> >                
> >              N 
> >            WG 
> >                
> >       ORG   
> >    BRLS      NONE   
> > A5310B DOC       =   
> >          49330
> > 2296       LHR020GW-01E2 
> >                
> >              N 
> >            WG 
> >                
> >       SN         
> > BRLS      NONE    E300   
> >    NO3       = 
> >            792
> > 2326       LHR020SD-00E2 
> >            
> >    N         
> >    SE         
> >              
> >    MET   
> >    BRLS      METHOD 
> >             E1630   
> > MEHG   =         
> >    4.28
> > 2327       LHR020SD-00E2 
> >            
> >    N         
> >    SE         
> >              
> >    MI         
> > BRLS      METHOD       
> >       E160.3   SOLID 
> >   =         
> >    48.45
> > 2328       LHR020SD-00E2 
> >            
> >    N         
> >    SE         
> >              
> >    ORG   
> >    BRLS      NONE   
> > SW9060             
> >   TOC        =     
> >        4.823
> > 2329       LHR020SD-00E2 MY77J8
> > N         
> >    SE         
> >              
> >    MET   
> >    A4SW    METHOD   
> >          
> > C245.5   HG         
> > =             5100
> > 2330       LHR020SD-00E2 MY77J8
> > N         
> >    SE         
> >              
> >    MET   
> >    A4SW    METHOD   
> >          
> > E200.8   AG         
> > ND          1050
> > 2331       LHR020SD-00E2 MY77J8
> > N         
> >    SE         
> >              
> >    MET   
> >    A4SW    METHOD   
> >          
> > E200.8   AS       
> >    =         
> >    5500
> > 2332       LHR020SD-00E2 MY77J8
> > N         
> >    SE         
> >              
> >    MET   
> >    A4SW    METHOD   
> >          
> > E200.8   B         
> >    =         
> >    11400
> > 2346       LHR020SD-00E2 MY77J8
> > N         
> >    SE         
> >              
> >    MET   
> >    A4SW    SW3050B   
> >          SW6010B   
> >          V     
> >        =       
> >         56900
> > 2349       LHR020SD-00E2 MY77J8
> > N         
> >    SE         
> >              
> >    MI         
> > A4SW    METHOD         
> >     A2540G         
> >       SOLID    =   
> >          47.7
> > 
> > Desired output:
> >                
> > fldsampid           
> > CLP_ID  sacode 
> > matrix   etc...      CL 
> >          PO4   
> >     SO4        AG   
> >       AL       
> >    AS       
> >    B         
> >    V         
> >    Zn         
> >       etc...      ALK 
> >       ALKB      ALKC 
> >     SOLID    DOC   
> >    TOC        NO3
> >                
> > LHR020GW-01E2           
> >                
> >    N         
> >    WG         
> >              
> > <value>           
> >     <value>       
> >         <value>   
> >            
> > <value>           
> >     <value>       
> >         <value>   
> >            
> > <value>           
> >     <value>       
> >         <value>   
> >            
> > <value>           
> >     <value>       
> >         <value>   
> >            
> > <value>           
> >     <value>       
> >         <value>   
> >            
> > <value>           
> >     <value>
> >                
> > LHR020SD-00E2 MY77J8 N         
> >    SE         
> >              
> >    NA         
> > NA          NA     
> >     <value>       
> >         <value>   
> >            
> > <value>           
> >     <value>       
> >         <value>   
> >             NA   
> >       <value>     
> >           NA     
> >     NA          NA 
> >         <value>   
> >             NA   
> >       NA         
> > NA
> > 
> >     [[alternative HTML version deleted]]
> > 
> > ______________________________________________
> > R-help at r-project.org
> > mailing list
> > https://stat.ethz.ch/mailman/listinfo/r-help
> > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> > and provide commented, minimal, self-contained,
> > reproducible code.
> > 
> 
> 
>