[R] Reading recurring data in a text file

Thu Jul 25 04:41:20 CEST 2019

Sorry, I was on my phone and did not see that you were already using but 
completely missing the vectorized nature of these functions.

Consider the following:

#############
# after executing your sample code
slowmethod <- function( whalines ) {
   lines   <- whalines
   mc_list <- NULL
   for (i in 1:length(lines)){
     # Look for start of water content
     if(grepl(srchStr1, lines[i])){
       mc_list <- c(mc_list, i)
     }
   }

   tmp_list <- NULL
   for (i in 1:length(lines)){
     # Look for start of temperature data
     if(grepl(srchStr2, lines[i])){
       tmp_list <- c(tmp_list, i)
     }
   }

   # Store the water content arrays
   wc <- list()
   # Read all the moisture content profiles
   for(i in 1:length(mc_list)){
     lineNum <- mc_list[i] + 3
     mct <- read.table( text = wha
                      , skip=lineNum
                      , nrows=5
                      , col.names=c('depth','wc')
                      )
     wc[[i]] <- mct
   }

   # Store the water temperature arrays
   tmp <- list()
   # Read all the temperature profiles
   for(i in 1:length(tmp_list)){
     lineNum <- tmp_list[i] + 3
     tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
                       col.names=c('depth','tmp'))
     tmp[[i]] <- tmpt
   }
   list(mc = wc, wt = tmp )
}

library(data.table)

fastmethod <- function( whalines ) {
   # identify tabular formatted lines
   idx_tbl0 <- grepl( "^\\s*[.\\d+-]+\\s+[.\\d+-]+\\s*$"
                    , whalines
                    , perl=TRUE )
   # identify groups of lines according to moisture content
   mc_block <- grep( srchStr1, whalines, fixed = TRUE )
   mc_start <- rep( FALSE, length( whalines ) )
   mc_start[ mc_block+4L ] <- TRUE
   grp_mc <- cumsum( mc_start )
   # identify contiguous block of tabular lines in each block
   mc_tab <- 0 == ave( !idx_tbl0, grp_mc, FUN = cumsum )
   # extract moisture content data
   mc_dta <- fread( text = whalines[ mc_tab ], header=FALSE )
   mclist <- split( mc_dta, grp_mc[ mc_tab ] )

   # identify groups of lines according to moisture content
   wt_block <- grep( srchStr2, whalines, fixed = TRUE )
   wt_start <- rep( FALSE, length( whalines ) )
   wt_start[ wt_block+4L ] <- TRUE
   grp_wt <- cumsum( wt_start )
   # identify contiguous block of tabular lines in each block
   wt_tab <- 0 == ave( !idx_tbl0, grp_wt, FUN = cumsum )
   # extract data frames
   wt_dta <- fread( text = whalines[ wt_tab ], header=FALSE )
   wtlist <- split( wt_dta, grp_wt[ wt_tab ] )
   list(mc = mclist, wt = wtlist )
}
library(microbenchmark)
bigwhalines <- strsplit( paste( rep( wha, 100 )
                               , collapse = "\n" )
                        , "\n" )[[ 1 ]]
microbenchmark( slowresult <- slowmethod( bigwhalines )
               , fastresult <- fastmethod( bigwhalines )
               )

On Wed, 24 Jul 2019, Jeff Newmiller wrote:

> ?readLines
> ?grep
> ?textConnection
>
> On July 24, 2019 11:54:07 AM PDT, "Morway, Eric via R-help" <r-help using r-project.org> wrote:
>> The small reproducible example below works, but is way too slow on the
>> real
>> problem.  The real problem is attempting to extract ~2920 repeated
>> arrays
>> from a 60 Mb file and takes ~80 minutes.  I'm wondering how I might
>> re-engineer the script to avoid opening and closing the file 2920 times
>> as
>> is the case now.  That is, is there a way to keep the file open and
>> peel
>> out the arrays and stuff them into a list of data.tables, as is done in
>> the
>> small reproducible example below, but in a significantly faster way?
>>
>> wha <- "     INITIAL PRESSURE HEAD
>>     INITIAL TEMPERATURE SET TO 4.000E+00 DEGREES C
>>     VS2DH - MedSand for TL test
>>
>>     TOTAL ELAPSED TIME =  0.000000E+00 sec
>>     TIME STEP         0
>>
>>     MOISTURE CONTENT
>>  Z, IN
>>  m                       X OR R DISTANCE, IN m
>>                0.500
>>     0.075     0.1475
>>     0.225     0.1475
>>     0.375     0.1475
>>     0.525     0.1475
>>     0.675     0.1475
>> blah
>> blah
>> blah
>>     TEMPERATURE, IN DECREES C
>>  Z, IN
>>  m                       X OR R DISTANCE, IN m
>>                0.500
>>     0.075     1.1475
>>     0.225     2.1475
>>     0.375     3.1475
>>     0.525     4.1475
>>     0.675     5.1475
>> blah
>> blah
>> blah
>>
>>     TOTAL ELAPSED TIME =  8.6400E+04 sec
>>     TIME STEP         0
>>
>>     MOISTURE CONTENT
>>  Z, IN
>>  m                       X OR R DISTANCE, IN m
>>                0.500
>>     0.075     0.1875
>>     0.225     0.1775
>>     0.375     0.1575
>>     0.525     0.1675
>>     0.675     0.1475
>> blah
>> blah
>> blah     TEMPERATURE, IN DECREES C
>>  Z, IN
>>  m                       X OR R DISTANCE, IN m
>>                0.500
>>     0.075     1.1475
>>     0.225     2.1475
>>     0.375     3.1475
>>     0.525     4.1475
>>     0.675     5.1475
>> blah
>> blah
>> blah"
>>
>> example_content <- textConnection(wha)
>>
>> srchStr1 <- '     MOISTURE CONTENT'
>> srchStr2 <- 'TEMPERATURE, IN DECREES C'
>>
>> lines   <- readLines(example_content)
>> mc_list <- NULL
>> for (i in 1:length(lines)){
>>  # Look for start of water content
>>  if(grepl(srchStr1, lines[i])){
>>    mc_list <- c(mc_list, i)
>>  }
>> }
>>
>> tmp_list <- NULL
>> for (i in 1:length(lines)){
>>  # Look for start of temperature data
>>  if(grepl(srchStr2, lines[i])){
>>    tmp_list <- c(tmp_list, i)
>>  }
>> }
>>
>> # Store the water content arrays
>> wc <- list()
>> # Read all the moisture content profiles
>> for(i in 1:length(mc_list)){
>>  lineNum <- mc_list[i] + 3
>>  mct <- read.table(text = wha, skip=lineNum, nrows=5,
>>                    col.names=c('depth','wc'))
>>  wc[[i]] <- mct
>> }
>>
>> # Store the water temperature arrays
>> tmp <- list()
>> # Read all the temperature profiles
>> for(i in 1:length(tmp_list)){
>>  lineNum <- tmp_list[i] + 3
>>  tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
>>                    col.names=c('depth','tmp'))
>>  tmp[[i]] <- tmpt
>> }
>>
>> # quick inspection
>> length(wc)
>> wc[[1]]
>> # Looks like what I'm after, but too slow in real world problem
>>
>> 	[[alternative HTML version deleted]]
>>
>> ______________________________________________
>> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide
>> http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>
> -- 
> Sent from my phone. Please excuse my brevity.
>
> ______________________________________________
> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnewmil using dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                       Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k