[R] Reading recurring data in a text file
Ista Zahn
|@t@z@hn @end|ng |rom gm@||@com
Fri Jul 26 18:50:09 CEST 2019
Here is another possibility
library(stringr)
readterm <- function(term, text) {
lapply(str_split(text, fixed(term))[[1]][-1],
fread, skip = 4, nrows = 5
)
}
easymethod <- function(whalines) {
whalines <- str_c(whalines, collapse = "\n")
lapply(c(srchStr1, srchStr2),
readterm, text = whalines
)
}
## perhaps a tad slower, but I find it easier to follow.
library(microbenchmark)
bigwhalines <- strsplit( paste( rep( wha, 100 )
, collapse = "\n" )
, "\n" )[[ 1 ]]
microbenchmark(slowresult <- slowmethod( bigwhalines ),
fastresult <- fastmethod( bigwhalines ),
easyresult <- easymethod( bigwhalines )
)
On Wed, Jul 24, 2019 at 10:41 PM Jeff Newmiller
<jdnewmil using dcn.davis.ca.us> wrote:
>
> Sorry, I was on my phone and did not see that you were already using but
> completely missing the vectorized nature of these functions.
>
> Consider the following:
>
> #############
> # after executing your sample code
> slowmethod <- function( whalines ) {
> lines <- whalines
> mc_list <- NULL
> for (i in 1:length(lines)){
> # Look for start of water content
> if(grepl(srchStr1, lines[i])){
> mc_list <- c(mc_list, i)
> }
> }
>
> tmp_list <- NULL
> for (i in 1:length(lines)){
> # Look for start of temperature data
> if(grepl(srchStr2, lines[i])){
> tmp_list <- c(tmp_list, i)
> }
> }
>
> # Store the water content arrays
> wc <- list()
> # Read all the moisture content profiles
> for(i in 1:length(mc_list)){
> lineNum <- mc_list[i] + 3
> mct <- read.table( text = wha
> , skip=lineNum
> , nrows=5
> , col.names=c('depth','wc')
> )
> wc[[i]] <- mct
> }
>
> # Store the water temperature arrays
> tmp <- list()
> # Read all the temperature profiles
> for(i in 1:length(tmp_list)){
> lineNum <- tmp_list[i] + 3
> tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
> col.names=c('depth','tmp'))
> tmp[[i]] <- tmpt
> }
> list(mc = wc, wt = tmp )
> }
>
> library(data.table)
>
> fastmethod <- function( whalines ) {
> # identify tabular formatted lines
> idx_tbl0 <- grepl( "^\\s*[.\\d+-]+\\s+[.\\d+-]+\\s*$"
> , whalines
> , perl=TRUE )
> # identify groups of lines according to moisture content
> mc_block <- grep( srchStr1, whalines, fixed = TRUE )
> mc_start <- rep( FALSE, length( whalines ) )
> mc_start[ mc_block+4L ] <- TRUE
> grp_mc <- cumsum( mc_start )
> # identify contiguous block of tabular lines in each block
> mc_tab <- 0 == ave( !idx_tbl0, grp_mc, FUN = cumsum )
> # extract moisture content data
> mc_dta <- fread( text = whalines[ mc_tab ], header=FALSE )
> mclist <- split( mc_dta, grp_mc[ mc_tab ] )
>
> # identify groups of lines according to moisture content
> wt_block <- grep( srchStr2, whalines, fixed = TRUE )
> wt_start <- rep( FALSE, length( whalines ) )
> wt_start[ wt_block+4L ] <- TRUE
> grp_wt <- cumsum( wt_start )
> # identify contiguous block of tabular lines in each block
> wt_tab <- 0 == ave( !idx_tbl0, grp_wt, FUN = cumsum )
> # extract data frames
> wt_dta <- fread( text = whalines[ wt_tab ], header=FALSE )
> wtlist <- split( wt_dta, grp_wt[ wt_tab ] )
> list(mc = mclist, wt = wtlist )
> }
> library(microbenchmark)
> bigwhalines <- strsplit( paste( rep( wha, 100 )
> , collapse = "\n" )
> , "\n" )[[ 1 ]]
> microbenchmark( slowresult <- slowmethod( bigwhalines )
> , fastresult <- fastmethod( bigwhalines )
> )
>
>
> On Wed, 24 Jul 2019, Jeff Newmiller wrote:
>
> > ?readLines
> > ?grep
> > ?textConnection
> >
> > On July 24, 2019 11:54:07 AM PDT, "Morway, Eric via R-help" <r-help using r-project.org> wrote:
> >> The small reproducible example below works, but is way too slow on the
> >> real
> >> problem. The real problem is attempting to extract ~2920 repeated
> >> arrays
> >> from a 60 Mb file and takes ~80 minutes. I'm wondering how I might
> >> re-engineer the script to avoid opening and closing the file 2920 times
> >> as
> >> is the case now. That is, is there a way to keep the file open and
> >> peel
> >> out the arrays and stuff them into a list of data.tables, as is done in
> >> the
> >> small reproducible example below, but in a significantly faster way?
> >>
> >> wha <- " INITIAL PRESSURE HEAD
> >> INITIAL TEMPERATURE SET TO 4.000E+00 DEGREES C
> >> VS2DH - MedSand for TL test
> >>
> >> TOTAL ELAPSED TIME = 0.000000E+00 sec
> >> TIME STEP 0
> >>
> >> MOISTURE CONTENT
> >> Z, IN
> >> m X OR R DISTANCE, IN m
> >> 0.500
> >> 0.075 0.1475
> >> 0.225 0.1475
> >> 0.375 0.1475
> >> 0.525 0.1475
> >> 0.675 0.1475
> >> blah
> >> blah
> >> blah
> >> TEMPERATURE, IN DECREES C
> >> Z, IN
> >> m X OR R DISTANCE, IN m
> >> 0.500
> >> 0.075 1.1475
> >> 0.225 2.1475
> >> 0.375 3.1475
> >> 0.525 4.1475
> >> 0.675 5.1475
> >> blah
> >> blah
> >> blah
> >>
> >> TOTAL ELAPSED TIME = 8.6400E+04 sec
> >> TIME STEP 0
> >>
> >> MOISTURE CONTENT
> >> Z, IN
> >> m X OR R DISTANCE, IN m
> >> 0.500
> >> 0.075 0.1875
> >> 0.225 0.1775
> >> 0.375 0.1575
> >> 0.525 0.1675
> >> 0.675 0.1475
> >> blah
> >> blah
> >> blah TEMPERATURE, IN DECREES C
> >> Z, IN
> >> m X OR R DISTANCE, IN m
> >> 0.500
> >> 0.075 1.1475
> >> 0.225 2.1475
> >> 0.375 3.1475
> >> 0.525 4.1475
> >> 0.675 5.1475
> >> blah
> >> blah
> >> blah"
> >>
> >> example_content <- textConnection(wha)
> >>
> >> srchStr1 <- ' MOISTURE CONTENT'
> >> srchStr2 <- 'TEMPERATURE, IN DECREES C'
> >>
> >> lines <- readLines(example_content)
> >> mc_list <- NULL
> >> for (i in 1:length(lines)){
> >> # Look for start of water content
> >> if(grepl(srchStr1, lines[i])){
> >> mc_list <- c(mc_list, i)
> >> }
> >> }
> >>
> >> tmp_list <- NULL
> >> for (i in 1:length(lines)){
> >> # Look for start of temperature data
> >> if(grepl(srchStr2, lines[i])){
> >> tmp_list <- c(tmp_list, i)
> >> }
> >> }
> >>
> >> # Store the water content arrays
> >> wc <- list()
> >> # Read all the moisture content profiles
> >> for(i in 1:length(mc_list)){
> >> lineNum <- mc_list[i] + 3
> >> mct <- read.table(text = wha, skip=lineNum, nrows=5,
> >> col.names=c('depth','wc'))
> >> wc[[i]] <- mct
> >> }
> >>
> >> # Store the water temperature arrays
> >> tmp <- list()
> >> # Read all the temperature profiles
> >> for(i in 1:length(tmp_list)){
> >> lineNum <- tmp_list[i] + 3
> >> tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
> >> col.names=c('depth','tmp'))
> >> tmp[[i]] <- tmpt
> >> }
> >>
> >> # quick inspection
> >> length(wc)
> >> wc[[1]]
> >> # Looks like what I'm after, but too slow in real world problem
> >>
> >> [[alternative HTML version deleted]]
> >>
> >> ______________________________________________
> >> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> >> https://stat.ethz.ch/mailman/listinfo/r-help
> >> PLEASE do read the posting guide
> >> http://www.R-project.org/posting-guide.html
> >> and provide commented, minimal, self-contained, reproducible code.
> >
> > --
> > Sent from my phone. Please excuse my brevity.
> >
> > ______________________________________________
> > R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> > https://stat.ethz.ch/mailman/listinfo/r-help
> > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> > and provide commented, minimal, self-contained, reproducible code.
> >
>
> ---------------------------------------------------------------------------
> Jeff Newmiller The ..... ..... Go Live...
> DCN:<jdnewmil using dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go...
> Live: OO#.. Dead: OO#.. Playing
> Research Engineer (Solar/Batteries O.O#. #.O#. with
> /Software/Embedded Controllers) .OO#. .OO#. rocks...1k
>
> ______________________________________________
> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
More information about the R-help
mailing list