library(petro.One)
library(tm)
library(tibble)
# Select example 1 from petro.One.
# NOTE(review): presumably this makes the example HTML files below reachable
# by bare file name (e.g. by switching to the example folder) -- confirm in the
# petro.One documentation.
use_example(1)
# Convert each of the three saved conference result pages into a data frame.
p1 <- onepetro_page_to_dataframe("1000_conference.html")
p2 <- onepetro_page_to_dataframe("2000_conference.html")
p3 <- onepetro_page_to_dataframe("3000_conference.html")
# Stack the three pages row-wise into a single table of papers.
nn_papers <- rbind(p1, p2, p3)
# Print the combined table; the recorded console output follows.
nn_papers
## # A tibble: 2,918 x 6
##                                                    title_data
##                                                         <chr>
##  1                                    Neural Networks And AVO
##  2                        Deconvolution Using Neural Networks
##  3                   Neural Network Stacking Velocity Picking
##  4           Hydrocarbon Prediction Using Dual Neural Network
##  5      Higher-Order Neural Networks in Petroleum Engineering
##  6  Multiple Attenuation With Attribute-based Neural Networks
##  7   Conductive fracture identification using neural networks
##  8           APPLYING NEURAL NETWORK IN HYDROTREATING PROCESS
##  9                 Bit Bounce Detection Using Neural Networks
## 10 Artificial Neural Networks in Drilling Troubles Prediction
## # ... with 2,908 more rows, and 5 more variables: paper_id <chr>,
## #   source <chr>, type <chr>, year <int>, author1_data <chr>

# Build a corpus with one document per paper title, then normalize the text
# so that counts are case-insensitive and common English words are ignored.
vdocs <- VCorpus(VectorSource(nn_papers$title_data))
vdocs <- tm_map(vdocs, content_transformer(tolower))      # to lowercase
vdocs <- tm_map(vdocs, removeWords, stopwords("english")) # remove stopwords

# Count how often each term occurs across all titles.
tdm <- TermDocumentMatrix(vdocs)
# as.matrix() yields a regular terms x documents matrix so that rowSums()
# gives the total count per term.
tdm.matrix <- as.matrix(tdm)
tdm.rs <- sort(rowSums(tdm.matrix), decreasing = TRUE)

# Term/frequency table, most frequent term first.
tdm.df <- data.frame(
  word = names(tdm.rs),
  freq = tdm.rs,
  stringsAsFactors = FALSE
)
# as_tibble() replaces the deprecated as.tibble() alias.
as_tibble(tdm.df)                          # prevent long printing of dataframe
## # A tibble: 5,145 x 2
##          word  freq
##  *      <chr> <dbl>
##  1      using   666
##  2     neural   520
##  3  reservoir   499
##  4       data   348
##  5    seismic   291
##  6    network   288
##  7 artificial   283
##  8   analysis   249
##  9 prediction   245
## 10   networks   227
## # ... with 5,135 more rows
# There are 5,145 words under analysis. We will focus our attention on those
# words that occur at least 50 times.
# Draw a word cloud of the most frequent title terms.
library(wordcloud)   # brewer.pal() is expected to come in via RColorBrewer,
                     # which wordcloud depends on -- TODO confirm

# Fix the RNG seed so the word layout is reproducible between runs.
set.seed(1234)
# Only words with at least `min.freq` occurrences are drawn, capped at
# `max.words`; random.order = FALSE plots words in decreasing frequency.
wordcloud(words = tdm.df$word, freq = tdm.df$freq, min.freq = 50,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Note that in the word cloud there are words of common use such as using, use,
# new, approach and case. These words are not necessarily technical enough to
# improve our understanding of where the papers we are analyzing are focusing.
# In the next example, we will build our own custom stopwords to prevent these
# words from showing.