[R] Reading recurring data in a text file

Ista Zahn |@t@z@hn @end|ng |rom gm@||@com
Fri Jul 26 18:50:09 CEST 2019


Here is another possibility

library(stringr)

readterm <- function(term, text) {
    lapply(str_split(text, fixed(term))[[1]][-1],
           fread, skip = 4, nrows = 5
           )
}

easymethod <- function(whalines) {
    whalines <- str_c(whalines, collapse = "\n")
    lapply(c(srchStr1, srchStr2),
           readterm, text = whalines
    )
}


## perhaps a tad slower, but I find it easier to follow.

library(microbenchmark)
bigwhalines <- strsplit( paste( rep( wha, 100 )
                               , collapse = "\n" )
                        , "\n" )[[ 1 ]]
microbenchmark(slowresult <- slowmethod( bigwhalines ),
               fastresult <- fastmethod( bigwhalines ),
               easyresult <- easymethod( bigwhalines )
               )



On Wed, Jul 24, 2019 at 10:41 PM Jeff Newmiller
<jdnewmil using dcn.davis.ca.us> wrote:
>
> Sorry, I was on my phone and did not see that you were already using but
> completely missing the vectorized nature of these functions.
>
> Consider the following:
>
> #############
> # after executing your sample code
> slowmethod <- function( whalines ) {
>    lines   <- whalines
>    mc_list <- NULL
>    for (i in 1:length(lines)){
>      # Look for start of water content
>      if(grepl(srchStr1, lines[i])){
>        mc_list <- c(mc_list, i)
>      }
>    }
>
>    tmp_list <- NULL
>    for (i in 1:length(lines)){
>      # Look for start of temperature data
>      if(grepl(srchStr2, lines[i])){
>        tmp_list <- c(tmp_list, i)
>      }
>    }
>
>    # Store the water content arrays
>    wc <- list()
>    # Read all the moisture content profiles
>    for(i in 1:length(mc_list)){
>      lineNum <- mc_list[i] + 3
>      mct <- read.table( text = wha
>                       , skip=lineNum
>                       , nrows=5
>                       , col.names=c('depth','wc')
>                       )
>      wc[[i]] <- mct
>    }
>
>    # Store the water temperature arrays
>    tmp <- list()
>    # Read all the temperature profiles
>    for(i in 1:length(tmp_list)){
>      lineNum <- tmp_list[i] + 3
>      tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
>                        col.names=c('depth','tmp'))
>      tmp[[i]] <- tmpt
>    }
>    list(mc = wc, wt = tmp )
> }
>
> library(data.table)
>
> fastmethod <- function( whalines ) {
>    # identify tabular formatted lines
>    idx_tbl0 <- grepl( "^\\s*[.\\d+-]+\\s+[.\\d+-]+\\s*$"
>                     , whalines
>                     , perl=TRUE )
>    # identify groups of lines according to moisture content
>    mc_block <- grep( srchStr1, whalines, fixed = TRUE )
>    mc_start <- rep( FALSE, length( whalines ) )
>    mc_start[ mc_block+4L ] <- TRUE
>    grp_mc <- cumsum( mc_start )
>    # identify contiguous block of tabular lines in each block
>    mc_tab <- 0 == ave( !idx_tbl0, grp_mc, FUN = cumsum )
>    # extract moisture content data
>    mc_dta <- fread( text = whalines[ mc_tab ], header=FALSE )
>    mclist <- split( mc_dta, grp_mc[ mc_tab ] )
>
>    # identify groups of lines according to moisture content
>    wt_block <- grep( srchStr2, whalines, fixed = TRUE )
>    wt_start <- rep( FALSE, length( whalines ) )
>    wt_start[ wt_block+4L ] <- TRUE
>    grp_wt <- cumsum( wt_start )
>    # identify contiguous block of tabular lines in each block
>    wt_tab <- 0 == ave( !idx_tbl0, grp_wt, FUN = cumsum )
>    # extract data frames
>    wt_dta <- fread( text = whalines[ wt_tab ], header=FALSE )
>    wtlist <- split( wt_dta, grp_wt[ wt_tab ] )
>    list(mc = mclist, wt = wtlist )
> }
> library(microbenchmark)
> bigwhalines <- strsplit( paste( rep( wha, 100 )
>                                , collapse = "\n" )
>                         , "\n" )[[ 1 ]]
> microbenchmark( slowresult <- slowmethod( bigwhalines )
>                , fastresult <- fastmethod( bigwhalines )
>                )
>
>
> On Wed, 24 Jul 2019, Jeff Newmiller wrote:
>
> > ?readLines
> > ?grep
> > ?textConnection
> >
> > On July 24, 2019 11:54:07 AM PDT, "Morway, Eric via R-help" <r-help using r-project.org> wrote:
> >> The small reproducible example below works, but is way too slow on the
> >> real
> >> problem.  The real problem is attempting to extract ~2920 repeated
> >> arrays
> >> from a 60 Mb file and takes ~80 minutes.  I'm wondering how I might
> >> re-engineer the script to avoid opening and closing the file 2920 times
> >> as
> >> is the case now.  That is, is there a way to keep the file open and
> >> peel
> >> out the arrays and stuff them into a list of data.tables, as is done in
> >> the
> >> small reproducible example below, but in a significantly faster way?
> >>
> >> wha <- "     INITIAL PRESSURE HEAD
> >>     INITIAL TEMPERATURE SET TO 4.000E+00 DEGREES C
> >>     VS2DH - MedSand for TL test
> >>
> >>     TOTAL ELAPSED TIME =  0.000000E+00 sec
> >>     TIME STEP         0
> >>
> >>     MOISTURE CONTENT
> >>  Z, IN
> >>  m                       X OR R DISTANCE, IN m
> >>                0.500
> >>     0.075     0.1475
> >>     0.225     0.1475
> >>     0.375     0.1475
> >>     0.525     0.1475
> >>     0.675     0.1475
> >> blah
> >> blah
> >> blah
> >>     TEMPERATURE, IN DECREES C
> >>  Z, IN
> >>  m                       X OR R DISTANCE, IN m
> >>                0.500
> >>     0.075     1.1475
> >>     0.225     2.1475
> >>     0.375     3.1475
> >>     0.525     4.1475
> >>     0.675     5.1475
> >> blah
> >> blah
> >> blah
> >>
> >>     TOTAL ELAPSED TIME =  8.6400E+04 sec
> >>     TIME STEP         0
> >>
> >>     MOISTURE CONTENT
> >>  Z, IN
> >>  m                       X OR R DISTANCE, IN m
> >>                0.500
> >>     0.075     0.1875
> >>     0.225     0.1775
> >>     0.375     0.1575
> >>     0.525     0.1675
> >>     0.675     0.1475
> >> blah
> >> blah
> >> blah     TEMPERATURE, IN DECREES C
> >>  Z, IN
> >>  m                       X OR R DISTANCE, IN m
> >>                0.500
> >>     0.075     1.1475
> >>     0.225     2.1475
> >>     0.375     3.1475
> >>     0.525     4.1475
> >>     0.675     5.1475
> >> blah
> >> blah
> >> blah"
> >>
> >> example_content <- textConnection(wha)
> >>
> >> srchStr1 <- '     MOISTURE CONTENT'
> >> srchStr2 <- 'TEMPERATURE, IN DECREES C'
> >>
> >> lines   <- readLines(example_content)
> >> mc_list <- NULL
> >> for (i in 1:length(lines)){
> >>  # Look for start of water content
> >>  if(grepl(srchStr1, lines[i])){
> >>    mc_list <- c(mc_list, i)
> >>  }
> >> }
> >>
> >> tmp_list <- NULL
> >> for (i in 1:length(lines)){
> >>  # Look for start of temperature data
> >>  if(grepl(srchStr2, lines[i])){
> >>    tmp_list <- c(tmp_list, i)
> >>  }
> >> }
> >>
> >> # Store the water content arrays
> >> wc <- list()
> >> # Read all the moisture content profiles
> >> for(i in 1:length(mc_list)){
> >>  lineNum <- mc_list[i] + 3
> >>  mct <- read.table(text = wha, skip=lineNum, nrows=5,
> >>                    col.names=c('depth','wc'))
> >>  wc[[i]] <- mct
> >> }
> >>
> >> # Store the water temperature arrays
> >> tmp <- list()
> >> # Read all the temperature profiles
> >> for(i in 1:length(tmp_list)){
> >>  lineNum <- tmp_list[i] + 3
> >>  tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
> >>                    col.names=c('depth','tmp'))
> >>  tmp[[i]] <- tmpt
> >> }
> >>
> >> # quick inspection
> >> length(wc)
> >> wc[[1]]
> >> # Looks like what I'm after, but too slow in real world problem
> >>
> >>      [[alternative HTML version deleted]]
> >>
> >> ______________________________________________
> >> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> >> https://stat.ethz.ch/mailman/listinfo/r-help
> >> PLEASE do read the posting guide
> >> http://www.R-project.org/posting-guide.html
> >> and provide commented, minimal, self-contained, reproducible code.
> >
> > --
> > Sent from my phone. Please excuse my brevity.
> >
> > ______________________________________________
> > R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> > https://stat.ethz.ch/mailman/listinfo/r-help
> > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> > and provide commented, minimal, self-contained, reproducible code.
> >
>
> ---------------------------------------------------------------------------
> Jeff Newmiller                        The     .....       .....  Go Live...
> DCN:<jdnewmil using dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
>                                        Live:   OO#.. Dead: OO#..  Playing
> Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
> /Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
>
> ______________________________________________
> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.



More information about the R-help mailing list