Getting started

library(LexFindR)

# Get cohort index of ark in dictionary of ark, art and bab
target <- "AA R K"
lexicon <- c("AA R K", "AA R T", "B AA B")

cohort <- get_cohorts(target, lexicon)
cohort
#> [1] 1 2


# To get forms rather than indices using base R
lexicon[cohort]
#> [1] "AA R K" "AA R T"


# To get forms rather than indices using the form option
get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T"


# Get count using base R
length(cohort)
#> [1] 2


# Get count using the count option
get_cohorts(target, lexicon, count = TRUE)
#> [1] 2


# Frequency weighting
target_freq <- 50
lexicon_freq <- c(50, 274, 45)

# get the summed log frequencies of competitors
get_fw(lexicon_freq)
#> [1] 13.33181


# 
get_fwcp(target_freq, lexicon_freq)
#> [1] 0.2934352

NOTE: On using ARPABET from the CMU Pronouncing Dictionary

# By default, CMU has numbers that indicate stress patterns
# 
# If you do not strip those out, instances of the same vowel
# with different stress numbers will be treated as different
# symbols. This may be useful for some purposes (e.g., finding
# cohorts or neighbors with the same stress pattern).
# 
# Here is a contrived example, where ARK will not be considered
# related to ART or BARK because of stress pattern differences
target <- "AA0 R K"
lexicon <- c("AA0 R K", "AA2 R T", "B AA3 R K")

get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA0 R K"

get_neighbors(target, lexicon, form = TRUE)
#> [1] "AA0 R K"


# If this is not the behavior we want, we can strip lexical 
# stress indicators using regular expressions
target <- gsub("\\d", "", target)
lexicon <- gsub("\\d", "", lexicon)

print(target)
#> [1] "AA R K"

print(lexicon)
#> [1] "AA R K"   "AA R T"   "B AA R K"


get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T"

get_neighbors(target, lexicon, form = TRUE)
#> [1] "AA R K"   "AA R T"   "B AA R K"

EXAMPLE: cohorts with the TRACE slex lexicon

This example shows how to do multiple steps at once.

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

glimpse(slex)
#> Rows: 212
#> Columns: 3
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…


# define the lexicon with the list of target words to compute
# cohorts for; we will use *target_df* instead of modifying
# slex or lemmalex directly
target_df <- slex

# specify the reference lexicon; here it is actually the list
# of pronunciations from slex, as we want to find all cohorts
# for all words in our lexicon. It is not necessary to create
# a new dataframe, but because we find it useful for more
# complex tasks, we use this approach here
lexicon_df <- target_df

# this instruction will create a new column in our target_df
# dataframe, "cohort_idx", which will be the list of lexicon_df
# indices corresponding to each word's cohort set
target_df$cohort_idx <-
  lapply(
    # in each lapply instance, select the target pronunciation
    target_df$Pronunciation,
    # in each lapply instance, apply the get_cohorts function
    FUN = get_cohorts,
    # in each lapply instance, compare the current target 
    # Pronunciation to each lexicon Pronunciation
    lexicon = lexicon_df$Pronunciation
  )

# let's look at the first few instances in each field...
glimpse(target_df)
#> Rows: 212
#> Columns: 4
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ cohort_idx    <list> 1, <2, 3, 4, 5>, <2, 3, 4, 5>, <2, 3, 4, 5>, <2, 3, 4, …

EXAMPLE: rhymes with the TRACE slex lexicon and tidyverse piping style

tidyverse piping style is more readable.

slex_rhymes <- slex %>% mutate(
  rhyme_idx = lapply(Pronunciation, get_rhymes, lexicon = Pronunciation),
  rhyme_str = lapply(rhyme_idx, function(idx) {
    Item[idx]
  }),
  rhyme_count = lengths(rhyme_idx)
)

glimpse(slex_rhymes)
#> Rows: 212
#> Columns: 6
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ rhyme_idx     <list> <1, 44, 130>, <2, 10, 69, 104, 184>, <3, 11, 29, 106>, …
#> $ rhyme_str     <list> <"ad", "gad", "rad">, <"ar", "bar", "kar", "par", "tar"…
#> $ rhyme_count   <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3, 3, 1, 2,…


slex_rhymes <- slex_rhymes %>%
  rowwise() %>%
  mutate(
    rhyme_freq = list(slex$Frequency[rhyme_idx]),
    rhyme_fw = get_fw(rhyme_freq),
    rhyme_fwcp = get_fwcp(Frequency, rhyme_freq)
  ) %>% 
  ungroup()

glimpse(slex_rhymes)
#> Rows: 212
#> Columns: 9
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ rhyme_idx     <list> <1, 44, 130>, <2, 10, 69, 104, 184>, <3, 11, 29, 106>, …
#> $ rhyme_str     <list> <"ad", "gad", "rad">, <"ar", "bar", "kar", "par", "tar"…
#> $ rhyme_count   <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3, 3, 1, 2,…
#> $ rhyme_freq    <list> <53, 332, 29>, <4406, 125, 386, 10, 20>, <50, 125, 234,…
#> $ rhyme_fw      <dbl> 13.142723, 24.473191, 19.684596, 15.046612, 4.718499, 7.…
#> $ rhyme_fwcp    <dbl> 0.3020905, 0.3428536, 0.1987352, 0.3730493, 1.0000000, 0…

EXAMPLE: Using parallelization

library(future.apply)
library(tictoc)

# using two cores for demo or else 
# set `workers` to availableCores() to use all cores
plan(multisession, workers = 2)

glimpse(lemmalex)
#> Rows: 17,750
#> Columns: 3
#> $ Item          <chr> "a", "abandon", "abandonment", "abate", "abbey", "abbot"…
#> $ Frequency     <dbl> 20415.27, 8.10, 0.96, 0.10, 3.18, 0.84, 0.02, 0.24, 3.35…
#> $ Pronunciation <chr> "AH", "AH B AE N D IH N", "AH B AE N D AH N M AH N T", "…



# the portion between tic and toc below takes ~X seconds on a 
# 15-inch Macbook Pro 6-core i9; if you replace future_lapply 
# with lapply, it takes ~317 secs, v. 66 secs with future_lapply

tic("Finding rhymes")

slex_rhyme_lemmalex <- lemmalex %>% mutate(
  rhyme = future_lapply(Pronunciation, get_rhymes, 
                            lexicon = lemmalex$Pronunciation),
  rhyme_str = lapply(rhyme, function(idx) {
    lemmalex$Item[idx]
  }),
  rhyme_len = lengths(rhyme)
)

toc()
#> Finding rhymes: 37.793 sec elapsed


glimpse(slex_rhyme_lemmalex)
#> Rows: 17,750
#> Columns: 6
#> $ Item          <chr> "a", "abandon", "abandonment", "abate", "abbey", "abbot"…
#> $ Frequency     <dbl> 20415.27, 8.10, 0.96, 0.10, 3.18, 0.84, 0.02, 0.24, 3.35…
#> $ Pronunciation <chr> "AH", "AH B AE N D IH N", "AH B AE N D AH N M AH N T", "…
#> $ rhyme         <list> <1, 8846, 15769>, 2, 3, <4, 1136>, <5, 1092, 1285, 1331…
#> $ rhyme_str     <list> <"a", "le", "the">, "abandon", "abandonment", <"abate",…
#> $ rhyme_len     <int> 3, 1, 1, 2, 5, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1,…

EXAMPLE: Putting it all together

This extended example is from a paper describing LexFindR to be submitted in Fall, 2020.

library(LexFindR)
library(tidyverse) # for glimpse
library(future.apply) # parallelization
library(tictoc) # timing utilities

# In this example, we define a dataframe source for target words
# (target_df) and another for the lexicon to compare the target
# words to (lexicon_df). Often, these will be the same, but we keep
# them separate here to make it easier for others to generalize from
# this example code.

# Code assumes you have at least 3 columns in target_df & lexicon_df:
# 1. Item -- a label of some sort, can be identical to Pronunciation
# 2. Pronunciation -- typically a phonological form
# 3. Frequency -- should be in occurrences per million, or some other
#                 raw form, as the functions below take the log of
#                 the frequency form. See advice about padding in
#                 the main article text.
#
# Of course, you can name your fields as you like, and edit the
# field names below appropriately.
target_df <- slex
lexicon_df <- target_df

# Prepare for parallelizing
# 1. how many cores do we have?
# num_cores <- availableCores()

# using two cores for demo
num_cores <- 2

print(paste0("Using num_cores: ", num_cores))
#> [1] "Using num_cores: 2"

# 2. now let future.apply figure out how to optimize parallel
#    division of labor over cores
plan(multisession, workers = num_cores)

# the functions in this list all return lists of word indices; the
# uniqueness point function is not included because it returns a
# single value per word.
fun_list <- c(
  "cohorts", "neighbors",
  "rhymes", "homoforms",
  "target_embeds_in", "embeds_in_target",
  "nohorts", "cohortsP", "neighborsP",
  "target_embeds_inP", "embeds_in_targetP"
)

# we need to keep track of the P variants, as we need to tell get_fwcp
# to add in the target frequency for these, as they exclude the target
Ps <- c(
  "cohortsP", "neighborsP", "target_embeds_inP",
  "embeds_in_targetP"
)

# determine how much to pad based on minimum frequency
if (min(target_df$Frequency) == 0) {
  pad <- 2
} else if (min(target_df$Frequency) < 1) {
  pad <- 1
} else {
  pad <- 0
}

# now let's loop through the functions
for (fun_name in fun_list) {
  # start timer for this function
  tic(fun_name)

  # the P functions do not include the target in the denominator for
  # get_fwcp; if we want this to be a consistent ratio, we need to
  # add target frequency to the denominator
  add_target <- FALSE
  if (fun_name %in% Ps) {
    add_target <- TRUE
  }

  # inform the user that we are starting the next function, make sure
  # we are correctly adding target or not
  cat("Starting", fun_name, " -- add_target = ", add_target)
  func <- paste0("get_", fun_name)

  # use *future_lapply* to do the competitor search, creating
  # a new column in *target_df* that will be this function's
  # name + _idx (e.g., cohort_idx)
  target_df[[paste0(fun_name, "_idx")]] <-
    future_lapply(target_df$Pronunciation,
      FUN = get(func),
      lexicon = lexicon_df$Pronunciation
    )

  # list the competitor form labels in functionname_str
  target_df[[paste0(fun_name, "_str")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Item[idx]
    }
  )

  # list the competitor frequencies in functionname_freq
  target_df[[paste0(fun_name, "_freq")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Frequency[idx]
    }
  )

  # put the count of competitors in functionname_num
  target_df[[paste0(fun_name, "_num")]] <-
    lengths(target_df[[paste0(fun_name, "_idx")]])

  # put the FW in functionname_fwt
  target_df[[paste0(fun_name, "_fwt")]] <-
    mapply(get_fw,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad
    )

  # put the FWCP in functionname_fwcp
  target_df[[paste0(fun_name, "_fwcp")]] <-
    mapply(get_fwcp,
      target_freq = target_df$Frequency,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad, add_target = add_target
    )

  toc()
}
#> Starting cohorts  -- add_target =  FALSEcohorts: 0.084 sec elapsed
#> Starting neighbors  -- add_target =  FALSEneighbors: 0.067 sec elapsed
#> Starting rhymes  -- add_target =  FALSErhymes: 0.043 sec elapsed
#> Starting homoforms  -- add_target =  FALSEhomoforms: 0.035 sec elapsed
#> Starting target_embeds_in  -- add_target =  FALSEtarget_embeds_in: 0.039 sec elapsed
#> Starting embeds_in_target  -- add_target =  FALSEembeds_in_target: 0.042 sec elapsed
#> Starting nohorts  -- add_target =  FALSEnohorts: 0.059 sec elapsed
#> Starting cohortsP  -- add_target =  TRUEcohortsP: 0.06 sec elapsed
#> Starting neighborsP  -- add_target =  TRUEneighborsP: 0.073 sec elapsed
#> Starting target_embeds_inP  -- add_target =  TRUEtarget_embeds_inP: 0.099 sec elapsed
#> Starting embeds_in_targetP  -- add_target =  TRUEembeds_in_targetP: 0.079 sec elapsed


# Note that get_neighborsP excludes rhymes. If you do not want to 
# track rhymes separately and want neighborsP to include all 
# rhymes that are not cohorts, you can create new fields that 
# combine them, as we do here, creating "Pr" versions
target_df$neighborsPr_num = target_df$neighborsP_num + target_df$rhymes_num
target_df$neighborsPr_fwcp = target_df$neighborsP_fwcp + target_df$rhymes_fwcp
target_df$neighborsPr_fwt = target_df$neighborsP_fwt + target_df$rhymes_fwt

# Now let's streamline the dataframe; we'll select the num, fwt, and fwcp
# columns and put them in that order, while not keeping some of the other
# 'helper' columns we created

export_df <- target_df %>% 
  select(Item | Pronunciation | Frequency 
         | ends_with("_num") | ends_with("_fwt") | ends_with("_fwcp"))

glimpse(export_df)
#> Rows: 212
#> Columns: 39
#> $ Item                   <chr> "ad", "ar", "ark", "art", "art^st", "bab", "bab…
#> $ Pronunciation          <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH …
#> $ Frequency              <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 1…
#> $ cohorts_num            <int> 1, 4, 4, 4, 4, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,…
#> $ neighbors_num          <int> 4, 8, 6, 5, 1, 4, 4, 2, 1, 7, 5, 1, 7, 5, 8, 3,…
#> $ rhymes_num             <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3,…
#> $ homoforms_num          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ target_embeds_in_num   <int> 6, 29, 5, 9, 1, 2, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1…
#> $ embeds_in_target_num   <int> 1, 1, 2, 2, 5, 1, 3, 2, 1, 2, 4, 2, 1, 3, 3, 2,…
#> $ nohorts_num            <int> 1, 3, 3, 3, 1, 3, 3, 2, 1, 3, 2, 1, 2, 2, 3, 1,…
#> $ cohortsP_num           <int> 0, 1, 1, 1, 3, 4, 4, 5, 6, 4, 5, 6, 1, 1, 0, 2,…
#> $ neighborsP_num         <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0,…
#> $ target_embeds_inP_num  <int> 3, 21, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0…
#> $ embeds_in_targetP_num  <int> 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
#> $ neighborsPr_num        <int> 4, 6, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 4, 6, 3,…
#> $ cohorts_fwt            <dbl> 3.970292, 22.634373, 22.634373, 22.634373, 22.6…
#> $ neighbors_fwt          <dbl> 21.533445, 37.968634, 33.688446, 27.349358, 4.7…
#> $ rhymes_fwt             <dbl> 13.142723, 24.473191, 19.684596, 15.046612, 4.7…
#> $ homoforms_fwt          <dbl> 3.970292, 8.390723, 3.912023, 5.613128, 4.71849…
#> $ target_embeds_in_fwt   <dbl> 29.792782, 127.685319, 22.680328, 42.517044, 4.…
#> $ embeds_in_target_fwt   <dbl> 3.970292, 8.390723, 12.302746, 14.003851, 35.28…
#> $ nohorts_fwt            <dbl> 3.970292, 17.915874, 17.915874, 17.915874, 4.71…
#> $ cohortsP_fwt           <dbl> 0.000000, 4.718499, 4.718499, 4.718499, 17.9158…
#> $ neighborsP_fwt         <dbl> 8.390723, 3.970292, 0.000000, 0.000000, 0.00000…
#> $ target_embeds_inP_fwt  <dbl> 16.650059, 88.968478, 2.995732, 22.751933, 0.00…
#> $ embeds_in_targetP_fwt  <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 16.5642…
#> $ neighborsPr_fwt        <dbl> 21.533445, 28.443483, 19.684596, 15.046612, 4.7…
#> $ cohorts_fwcp           <dbl> 1.00000000, 0.37070710, 0.17283550, 0.24799133,…
#> $ neighbors_fwcp         <dbl> 0.1843779, 0.2209909, 0.1161236, 0.2052380, 1.0…
#> $ rhymes_fwcp            <dbl> 0.3020905, 0.3428536, 0.1987352, 0.3730493, 1.0…
#> $ homoforms_fwcp         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ target_embeds_in_fwcp  <dbl> 0.13326355, 0.06571407, 0.17248529, 0.13202066,…
#> $ embeds_in_target_fwcp  <dbl> 1.0000000, 1.0000000, 0.3179797, 0.4008275, 0.1…
#> $ nohorts_fwcp           <dbl> 1.0000000, 0.4683401, 0.2183551, 0.3133047, 1.0…
#> $ cohortsP_fwcp          <dbl> 1.0000000, 0.6400626, 0.4532777, 0.5432957, 0.2…
#> $ neighborsP_fwcp        <dbl> 0.3211947, 0.6788053, 1.0000000, 1.0000000, 1.0…
#> $ target_embeds_inP_fwcp <dbl> 0.19254240, 0.08618315, 0.56632333, 0.19788881,…
#> $ embeds_in_targetP_fwcp <dbl> 1.0000000, 1.0000000, 1.0000000, 1.0000000, 0.2…
#> $ neighborsPr_fwcp       <dbl> 0.6232852, 1.0216590, 1.1987352, 1.3730493, 2.0…

LexFindR vignette

ZhaoBin Li, Anne Marie Crinnion, and James S. Magnuson

2024-06-16

Overview

Installation

Getting started

NOTE: On using ARPABET from the CMU Pronouncing Dictionary

EXAMPLE: cohorts with the TRACE slex lexicon

EXAMPLE: rhymes with the TRACE slex lexicon and tidyverse piping style

EXAMPLE: Using parallelization

EXAMPLE: Putting it all together