[R] Help with Data Transformation
Guy Jett
GJett at itsi.com
Mon Jan 10 23:20:18 CET 2011
Thank you John,
I have put a copy of the dput file below my signature block. I hope that is what you need as I am unfamiliar with that function. Note that empty cells need to become "NA".
The "=" character is part of the "PARVALUE" column.
Yours,
Guy
gjett at itsi.com
structure(list(X = c(2268L, 2269L, 2270L, 2272L, 2273L, 2274L,
2275L, 2276L, 2289L, 2290L, 2291L, 2292L, 2293L, 2294L, 2295L,
2296L, 2326L, 2327L, 2328L, 2329L, 2330L, 2331L, 2332L, 2346L,
2349L, NA, NA, NA, NA, NA), fldsampid = structure(c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 2L, 3L, 4L), .Label = c("", "fldsampid",
"LHR020GW-01E2", "LHR020SD-00E2"), class = "factor"), CLP_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 3L), .Label = c("",
"CLP_ID", "MY77J8"), class = "factor"), sacode = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 2L), .Label = c("",
"N", "sacode"), class = "factor"), matrix = structure(c(4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 4L, 3L), .Label = c("",
"matrix", "SE", "WG"), class = "factor"), etc. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("",
"etc."), class = "factor"), prccode = structure(c(4L, 4L, 4L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 8L, 5L, 6L, 7L,
5L, 5L, 5L, 5L, 5L, 6L, 1L, 1L, 3L, 2L, NA), .Label = c("", "<value>",
"CL", "INO", "MET", "MI", "ORG", "SN"), class = "factor"), Lab = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 5L, 2L, NA), .Label = c("",
"<value>", "A4SW", "BRLS", "PO4"), class = "factor"), EXMCODE = structure(c(5L,
5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 5L, 5L, 4L,
4L, 5L, 4L, 4L, 4L, 4L, 7L, 4L, 1L, 1L, 6L, 2L, NA), .Label = c("",
"<value>", "FLDFLT", "METHOD", "NONE", "SO4", "SW3050B"), class = "factor"),
Analysis = structure(c(13L, 13L, 13L, 10L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 3L, 3L, 3L, 5L, 13L, 9L, 8L, 15L, 7L,
12L, 12L, 12L, 14L, 4L, 1L, 1L, 6L, 2L, 2L), .Label = c("",
"<value>", "A2320", "A2540G", "A5310B", "AG", "C245.5", "E160.3",
"E1630", "E1631", "E1638", "E200.8", "E300", "SW6010B", "SW9060"
), class = "factor"), PARLABEL = structure(c(10L, 16L, 17L,
12L, 3L, 4L, 8L, 9L, 20L, 21L, 15L, 5L, 6L, 7L, 11L, 14L,
13L, 18L, 19L, 12L, 3L, 8L, 9L, 20L, 18L, 1L, 1L, 4L, 2L,
2L), .Label = c("", "<value>", "AG", "AL", "ALK", "ALKB",
"ALKC", "AS", "B", "CL", "DOC", "HG", "MEHG", "NO3", "PB",
"PO4", "SO4", "SOLID", "TOC", "V", "Zn"), class = "factor"),
PARVQ = structure(c(3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
5L, 3L, 3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 3L, 3L, 3L, 3L,
1L, 1L, 4L, 2L, 2L), .Label = c("", "<value>", "=", "AS",
"ND"), class = "factor"), Result = structure(c(12L, 20L,
11L, 3L, 10L, 8L, 14L, 26L, 9L, 5L, 4L, 25L, 25L, 13L, 19L,
24L, 15L, 18L, 16L, 21L, 6L, 22L, 7L, 23L, 17L, 1L, 1L, 27L,
2L, 2L), .Label = c("", "<value>", "0.00171", "0.008", "1.76",
"1050", "11400", "122", "131", "2.57", "22460", "23590.9",
"2500", "317", "4.28", "4.823", "47.7", "48.45", "49330",
"50", "5100", "5500", "56900", "792", "807000", "9970", "B"
), class = "factor"), X.1 = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", "<value>",
"V"), class = "factor"), X.2 = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "Zn"), class = "factor"), X.3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("",
"<value>", "etc."), class = "factor"), X.4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "ALK"), class = "factor"), X.5 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "ALKB"), class = "factor"), X.6 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "ALKC"), class = "factor"), X.7 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("",
"<value>", "SOLID"), class = "factor"), X.8 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "DOC"), class = "factor"), X.9 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "TOC"), class = "factor"), X.10 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("",
"<value>", "NO3"), class = "factor")), .Names = c("X", "fldsampid",
"CLP_ID", "sacode", "matrix", "etc.", "prccode", "Lab", "EXMCODE",
"Analysis", "PARLABEL", "PARVQ", "Result", "X.1", "X.2", "X.3",
"X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10"), class = "data.frame", row.names = c(NA,
-30L))
-----Original Message-----
From: John Kane [mailto:jrkrideau at yahoo.ca]
Sent: Monday, January 10, 2011 1:43 PM
To: r-help at r-project.org; Guy Jett
Subject: Re: [R] Help with Data Transformation
That sample data set is really hard to read. Could you resent it after having used dput on it?
A file output with dput is easily read into R and makes seeing what you need much easier. BTW what are the = doing?
Thanks
--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote:
> From: Guy Jett <GJett at itsi.com>
> Subject: [R] Help with Data Transformation
> To: "r-help at r-project.org" <r-help at r-project.org>
> Received: Monday, January 10, 2011, 3:59 PM Greetings, I am new to R
> and am having trouble with parsing a file with the following
> characteristics:
>
> * Individual results
> for a single sample are written to multiple lines.
>
> * First 16 columns
> are constant from sample to sample.
>
> * Remaining 10 need
> to be matched up (cross-tabbed?)
>
> o (the exact contents for the remaining 10 vary from sample to
> sample, as indicated in the extract
> below)
>
> * Ultimate goal is to
> run various comparisons between the variable columns, compare samples
> from separate populations, and graph samples from the separate
> populations.
>
> * (An extract is
> provided below)
>
> The data is initially extracted from an SQL database into Excel, then
> saved as a tab-delimited text file for use in R.
> I have been successful in using subset() to extract specific sample
> types, but have not yet been able to transform the data so that all
> the data needed is on a single line. I have looked at several R
> manuals, read through 'R in a Nutshell', prowled the help resources (R
> Site Search and the Google link), tried stack(), subset(), reshape(),
> and several other functions, to no avail.
>
> Thank you very much for your help. This seems like a wonderful
> community, Guy Jett, R.G.
> Project Geologist
> gjett at itsi.com<mailto:gjett at itsi.com>
>
> Example Data Input (subset):
>
> fldsampid
> CLP_ID sacode
> matrix etc...
> prccode
> Lab
> EXMCODE
> Analysis
> PARLABEL
> PARVQ Result
> 2268 LHR020GW-01E2
>
> N
> WG
>
> INO
> BRLS NONE E300
> CL
> =
> 23590.9
> 2269 LHR020GW-01E2
>
> N
> WG
>
> INO
> BRLS NONE E300
> PO4 ND
> 50
> 2270 LHR020GW-01E2
>
> N
> WG
>
> INO
> BRLS NONE E300
> SO4 =
> 22460
> 2272 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1631 HG
> =
> 0.00171
> 2273 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1638 AG
> = 2.57
> 2274 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1638 AL
> =
> 122
> 2275 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1638 AS
> =
> 317
> 2276 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1638 B
> =
> 9970
> 2289 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1638 V
> =
> 131
> 2290 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS FLDFLT
> E1638 Zn
> =
> 1.76
> 2291 LHR020GW-01E2
>
> N
> WG
>
> MET
> BRLS METHOD
> E1638
> PB ND
> 0.008
> 2292 LHR020GW-01E2
>
> N
> WG
>
> MI
> BRLS NONE A2320
> ALK =
> 807000
> 2293 LHR020GW-01E2
>
> N
> WG
>
> MI
> BRLS NONE A2320
> ALKB =
> 807000
> 2294 LHR020GW-01E2
>
> N
> WG
>
> MI
> BRLS NONE A2320
> ALKC ND
> 2500
> 2295 LHR020GW-01E2
>
> N
> WG
>
> ORG
> BRLS NONE
> A5310B DOC =
> 49330
> 2296 LHR020GW-01E2
>
> N
> WG
>
> SN
> BRLS NONE E300
> NO3 =
> 792
> 2326 LHR020SD-00E2
>
> N
> SE
>
> MET
> BRLS METHOD
> E1630
> MEHG =
> 4.28
> 2327 LHR020SD-00E2
>
> N
> SE
>
> MI
> BRLS METHOD
> E160.3 SOLID
> =
> 48.45
> 2328 LHR020SD-00E2
>
> N
> SE
>
> ORG
> BRLS NONE
> SW9060
> TOC =
> 4.823
> 2329 LHR020SD-00E2 MY77J8
> N
> SE
>
> MET
> A4SW METHOD
>
> C245.5 HG
> = 5100
> 2330 LHR020SD-00E2 MY77J8
> N
> SE
>
> MET
> A4SW METHOD
>
> E200.8 AG
> ND 1050
> 2331 LHR020SD-00E2 MY77J8
> N
> SE
>
> MET
> A4SW METHOD
>
> E200.8 AS
> =
> 5500
> 2332 LHR020SD-00E2 MY77J8
> N
> SE
>
> MET
> A4SW METHOD
>
> E200.8 B
> =
> 11400
> 2346 LHR020SD-00E2 MY77J8
> N
> SE
>
> MET
> A4SW SW3050B
> SW6010B
> V
> =
> 56900
> 2349 LHR020SD-00E2 MY77J8
> N
> SE
>
> MI
> A4SW METHOD
> A2540G
> SOLID =
> 47.7
>
> Desired output:
>
> fldsampid
> CLP_ID sacode
> matrix etc... CL
> PO4
> SO4 AG
> AL
> AS
> B
> V
> Zn
> etc... ALK
> ALKB ALKC
> SOLID DOC
> TOC NO3
>
> LHR020GW-01E2
>
> N
> WG
>
> <value>
> <value>
> <value>
>
> <value>
> <value>
> <value>
>
> <value>
> <value>
> <value>
>
> <value>
> <value>
> <value>
>
> <value>
> <value>
> <value>
>
> <value>
> <value>
>
> LHR020SD-00E2 MY77J8 N
> SE
>
> NA
> NA NA
> <value>
> <value>
>
> <value>
> <value>
> <value>
> NA
> <value>
> NA
> NA NA
> <value>
> NA
> NA
> NA
>
> [[alternative HTML version deleted]]
>
> ______________________________________________
> R-help at r-project.org
> mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained,
> reproducible code.
>
More information about the R-help
mailing list