library(LexFindR)
The LexFindR package implements R code to find various competitor types studied in psycholinguistics, including cohorts (get_cohorts), rhymes (get_rhymes), neighbors (get_neighbors), words that embed within a target word (get_embeds_in_target), and words a target word embeds into (get_target_embeds_in).
The code uses regular expressions and balances speed and readability. By default, it expects complete pronunciation transcriptions (e.g., ARPAbet), in which each phoneme is coded as one or more ASCII characters and phonemes are separated by spaces. However, you can also use forms without delimiters by passing the sep = "" argument where appropriate (see the sketch after the first example below). As shown in the vignette, alternative transcriptions can easily be converted to this format.
# Install LexFindR from CRAN
install.packages("LexFindR")
# Or the development version from GitHub:
# install.packages("devtools")
devtools::install_github("maglab-uconn/LexFindR")
library(LexFindR)
# Get cohort index of ark in dictionary of ark, art and bab
<- "AA R K"
target <- c("AA R K", "AA R T", "B AA B")
lexicon
<- get_cohorts(target, lexicon)
cohort
cohort#> [1] 1 2
# To get forms rather than indices using base R
lexicon[cohort]
#> [1] "AA R K" "AA R T"
# To get forms rather than indices using the form option
get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T"
# Get count using base R
length(cohort)
#> [1] 2
# Get count using the count option
get_cohorts(target, lexicon, count = TRUE)
#> [1] 2
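As noted in the introduction, the same search works on undelimited transcriptions via the sep argument. Here is a minimal sketch using plain orthographic forms; the result mirrors the delimited example above.
# cohort search with no delimiter between symbols
get_cohorts("ark", c("ark", "art", "bab"), sep = "", form = TRUE)
#> [1] "ark" "art"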
# Frequency weighting
target_freq <- 50
lexicon_freq <- c(50, 274, 45)
# get the summed log frequencies of competitors
get_fw(lexicon_freq)
#> [1] 13.33181
# get the frequency-weighted competitor probability (FWCP)
get_fwcp(target_freq, lexicon_freq)
#> [1] 0.2934352
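As a quick sanity check, these values can be reproduced by hand: FW is the sum of the natural-log frequencies, and FWCP divides the target's log frequency by that sum.
# FW by hand: summed natural-log frequencies of the competitors
sum(log(lexicon_freq))
#> [1] 13.33181
# FWCP by hand: target log frequency over the summed log frequencies
log(target_freq) / sum(log(lexicon_freq))
#> [1] 0.2934352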
# By default, CMU has numbers that indicate stress patterns
#
# If you do not strip those out, instances of the same vowel
# with different stress numbers will be treated as different
# symbols. This may be useful for some purposes (e.g., finding
# cohorts or neighbors with the same stress pattern).
#
# Here is a contrived example, where ARK will not be considered
# related to ART or BARK because of stress pattern differences
<- "AA0 R K"
target <- c("AA0 R K", "AA2 R T", "B AA3 R K")
lexicon
get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA0 R K"
get_neighbors(target, lexicon, form = TRUE)
#> [1] "AA0 R K"
# If this is not the behavior we want, we can strip lexical
# stress indicators using regular expressions
target <- gsub("\\d", "", target)
lexicon <- gsub("\\d", "", lexicon)
print(target)
#> [1] "AA R K"
print(lexicon)
#> [1] "AA R K" "AA R T" "B AA R K"
get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T"
get_neighbors(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T" "B AA R K"
This example shows how to do multiple steps at once, computing cohorts for every word in a lexicon.
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
#> ✓ tibble 3.1.2 ✓ dplyr 1.0.7
#> ✓ tidyr 1.1.3 ✓ stringr 1.4.0
#> ✓ readr 1.4.0 ✓ forcats 0.5.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
glimpse(slex)
#> Rows: 212
#> Columns: 3
#> $ Item <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
# define the set of target words to compute cohorts for; we
# will use *target_df* instead of modifying slex or lemmalex
# directly
target_df <- slex
# specify the reference lexicon; here it is actually the list
# of pronunciations from slex, as we want to find all cohorts
# for all words in our lexicon. It is not necessary to create
# a new dataframe, but because we find it useful for more
# complex tasks, we use this approach here
lexicon_df <- target_df
# this instruction will create a new column in our target_df
# dataframe, "cohort_idx", which will be the list of lexicon_df
# indices corresponding to each word's cohort set
target_df$cohort_idx <-
  lapply(
    # in each lapply instance, select the target pronunciation
    target_df$Pronunciation,
    # in each lapply instance, apply the get_cohorts function
    FUN = get_cohorts,
    # in each lapply instance, compare the current target
    # Pronunciation to each lexicon Pronunciation
    lexicon = lexicon_df$Pronunciation
  )
# let's look at the first few instances in each field...
glimpse(target_df)
#> Rows: 212
#> Columns: 4
#> $ Item <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ cohort_idx <list> 1, <2, 3, 4, 5>, <2, 3, 4, 5>, <2, 3, 4, 5>, <2, 3, 4, …
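Staying in base R, you could go on to derive label and count columns from cohort_idx; the cohort_str and cohort_count names below are our own, chosen to parallel the tidyverse version that follows.
# sketch: map each index list to Item labels, and count competitors
target_df$cohort_str <- lapply(
  target_df$cohort_idx,
  function(idx) lexicon_df$Item[idx]
)
target_df$cohort_count <- lengths(target_df$cohort_idx)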
The same steps can be expressed more readably using tidyverse piping style.
slex_rhymes <- slex %>% mutate(
  rhyme_idx = lapply(Pronunciation, get_rhymes, lexicon = Pronunciation),
  rhyme_str = lapply(rhyme_idx, function(idx) {
    Item[idx]
  }),
  rhyme_count = lengths(rhyme_idx)
)
glimpse(slex_rhymes)
#> Rows: 212
#> Columns: 6
#> $ Item <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ rhyme_idx <list> <1, 44, 130>, <2, 10, 69, 104, 184>, <3, 11, 29, 106>, …
#> $ rhyme_str <list> <"ad", "gad", "rad">, <"ar", "bar", "kar", "par", "tar"…
#> $ rhyme_count <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3, 3, 1, 2,…
slex_rhymes <- slex_rhymes %>%
  rowwise() %>%
  mutate(
    rhyme_freq = list(slex$Frequency[rhyme_idx]),
    rhyme_fw = get_fw(rhyme_freq),
    rhyme_fwcp = get_fwcp(Frequency, rhyme_freq)
  ) %>%
  ungroup()
glimpse(slex_rhymes)
#> Rows: 212
#> Columns: 9
#> $ Item <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ rhyme_idx <list> <1, 44, 130>, <2, 10, 69, 104, 184>, <3, 11, 29, 106>, …
#> $ rhyme_str <list> <"ad", "gad", "rad">, <"ar", "bar", "kar", "par", "tar"…
#> $ rhyme_count <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3, 3, 1, 2,…
#> $ rhyme_freq <list> <53, 332, 29>, <4406, 125, 386, 10, 20>, <50, 125, 234,…
#> $ rhyme_fw <dbl> 13.142723, 24.473191, 19.684596, 15.046612, 4.718499, 7.…
#> $ rhyme_fwcp <dbl> 0.3020905, 0.3428536, 0.1987352, 0.3730493, 1.0000000, 0…
library(future.apply)
library(tictoc)
# using two cores for demo; otherwise set `workers`
# to availableCores() to use all cores
plan(multisession, workers = 2)
glimpse(lemmalex)
#> Rows: 17,750
#> Columns: 3
#> $ Item <chr> "a", "abandon", "abandonment", "abate", "abbey", "abbot"…
#> $ Frequency <dbl> 20415.27, 8.10, 0.96, 0.10, 3.18, 0.84, 0.02, 0.24, 3.35…
#> $ Pronunciation <chr> "AH", "AH B AE N D IH N", "AH B AE N D AH N M AH N T", "…
# the portion between tic and toc below takes ~66 secs with
# future_lapply on a 15-inch MacBook Pro 6-core i9; if you
# replace future_lapply with lapply, it takes ~317 secs
tic("Finding rhymes")
slex_rhyme_lemmalex <- lemmalex %>% mutate(
  rhyme = future_lapply(Pronunciation, get_rhymes,
    lexicon = lemmalex$Pronunciation
  ),
  rhyme_str = lapply(rhyme, function(idx) {
    lemmalex$Item[idx]
  }),
  rhyme_len = lengths(rhyme)
)
toc()
#> Finding rhymes: 119.952 sec elapsed
glimpse(slex_rhyme_lemmalex)
#> Rows: 17,750
#> Columns: 6
#> $ Item <chr> "a", "abandon", "abandonment", "abate", "abbey", "abbot"…
#> $ Frequency <dbl> 20415.27, 8.10, 0.96, 0.10, 3.18, 0.84, 0.02, 0.24, 3.35…
#> $ Pronunciation <chr> "AH", "AH B AE N D IH N", "AH B AE N D AH N M AH N T", "…
#> $ rhyme <list> <1, 8846, 15769>, 2, 3, <4, 1136>, <5, 1092, 1285, 1331…
#> $ rhyme_str <list> <"a", "le", "the">, "abandon", "abandonment", <"abate",…
#> $ rhyme_len <int> 3, 1, 1, 2, 5, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1,…
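When the parallel work is done, you can optionally return future to its default sequential backend.
# optional: restore sequential evaluation
plan(sequential)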
This extended example is from a paper describing LexFindR, to be submitted in Fall 2020.
library(LexFindR)
library(tidyverse) # for glimpse
library(future.apply) # parallelization
library(tictoc) # timing utilities
# In this example, we define a dataframe source for target words
# (target_df) and another for the lexicon to compare the target
# words to (lexicon_df). Often, these will be the same, but we keep
# them separate here to make it easier for others to generalize from
# this example code.
# Code assumes you have at least 3 columns in target_df & lexicon_df:
# 1. Item -- a label of some sort, can be identical to Pronunciation
# 2. Pronunciation -- typically a phonological form
# 3. Frequency -- should be in occurrences per million, or some other
# raw form, as the functions below take the log of
# the frequency form. See advice about padding in
# the main article text.
#
# Of course, you can name your fields as you like, and edit the
# field names below appropriately.
target_df <- slex
lexicon_df <- target_df
lexicon_df
# Prepare for parallelizing
# 1. how many cores do we have?
# num_cores <- availableCores()
# using two cores for demo
num_cores <- 2
print(paste0("Using num_cores: ", num_cores))
#> [1] "Using num_cores: 2"
# 2. now let future.apply figure out how to optimize parallel
# division of labor over cores
plan(multisession, workers = num_cores)
# the functions in this list all return lists of word indices; the
# uniqueness point function is not included because it returns a
# single value per word.
fun_list <- c(
  "cohorts", "neighbors",
  "rhymes", "homoforms",
  "target_embeds_in", "embeds_in_target",
  "nohorts", "cohortsP", "neighborsP",
  "target_embeds_inP", "embeds_in_targetP"
)
# we need to keep track of the P variants, as we need to tell get_fwcp
# to add in the target frequency for these, as they exclude the target
Ps <- c(
  "cohortsP", "neighborsP", "target_embeds_inP",
  "embeds_in_targetP"
)
# determine how much to pad based on minimum frequency
if (min(target_df$Frequency) == 0) {
  pad <- 2
} else if (min(target_df$Frequency) < 1) {
  pad <- 1
} else {
  pad <- 0
}
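The padding matters because the frequency-weighting functions log-transform frequencies: log(0) is -Inf and logs of values below 1 are negative, either of which would corrupt a summed-log measure such as FW or FWCP.
# illustration of why low frequencies need padding before log
log(0)
#> [1] -Inf
log(0.5)
#> [1] -0.6931472
log(0.5 + 1) # padded by 1
#> [1] 0.4054651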
# now let's loop through the functions
for (fun_name in fun_list) {
  # start timer for this function
  tic(fun_name)

  # the P functions do not include the target in the denominator for
  # get_fwcp; if we want this to be a consistent ratio, we need to
  # add target frequency to the denominator
  add_target <- FALSE
  if (fun_name %in% Ps) {
    add_target <- TRUE
  }

  # inform the user that we are starting the next function, make sure
  # we are correctly adding target or not
  cat("Starting", fun_name, " -- add_target = ", add_target)

  func <- paste0("get_", fun_name)

  # use *future_lapply* to do the competitor search, creating
  # a new column in *target_df* that will be this function's
  # name + _idx (e.g., cohort_idx)
  target_df[[paste0(fun_name, "_idx")]] <-
    future_lapply(target_df$Pronunciation,
      FUN = get(func),
      lexicon = lexicon_df$Pronunciation
    )

  # list the competitor form labels in functionname_str
  target_df[[paste0(fun_name, "_str")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Item[idx]
    }
  )

  # list the competitor frequencies in functionname_freq
  target_df[[paste0(fun_name, "_freq")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Frequency[idx]
    }
  )

  # put the count of competitors in functionname_num
  target_df[[paste0(fun_name, "_num")]] <-
    lengths(target_df[[paste0(fun_name, "_idx")]])

  # put the FW in functionname_fwt
  target_df[[paste0(fun_name, "_fwt")]] <-
    mapply(get_fw,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad
    )

  # put the FWCP in functionname_fwcp
  target_df[[paste0(fun_name, "_fwcp")]] <-
    mapply(get_fwcp,
      target_freq = target_df$Frequency,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad, add_target = add_target
    )

  toc()
}
#> Starting cohorts -- add_target = FALSEcohorts: 0.16 sec elapsed
#> Starting neighbors -- add_target = FALSEneighbors: 0.101 sec elapsed
#> Starting rhymes -- add_target = FALSErhymes: 0.068 sec elapsed
#> Starting homoforms -- add_target = FALSEhomoforms: 0.054 sec elapsed
#> Starting target_embeds_in -- add_target = FALSEtarget_embeds_in: 0.058 sec elapsed
#> Starting embeds_in_target -- add_target = FALSEembeds_in_target: 0.079 sec elapsed
#> Starting nohorts -- add_target = FALSEnohorts: 0.117 sec elapsed
#> Starting cohortsP -- add_target = TRUEcohortsP: 0.119 sec elapsed
#> Starting neighborsP -- add_target = TRUEneighborsP: 0.157 sec elapsed
#> Starting target_embeds_inP -- add_target = TRUEtarget_embeds_inP: 0.18 sec elapsed
#> Starting embeds_in_targetP -- add_target = TRUEembeds_in_targetP: 0.186 sec elapsed
# Note that get_neighborsP excludes rhymes. If you do not want to
# track rhymes separately and want neighborsP to include all
# rhymes that are not cohorts, you can create new fields that
# combine them, as we do here, creating "Pr" versions
target_df$neighborsPr_num <- target_df$neighborsP_num + target_df$rhymes_num
target_df$neighborsPr_fwcp <- target_df$neighborsP_fwcp + target_df$rhymes_fwcp
target_df$neighborsPr_fwt <- target_df$neighborsP_fwt + target_df$rhymes_fwt
# Now let's streamline the dataframe; we'll select the num, fwt, and fwcp
# columns and put them in that order, while not keeping some of the other
# 'helper' columns we created
export_df <- target_df %>%
  select(Item | Pronunciation | Frequency
    | ends_with("_num") | ends_with("_fwt") | ends_with("_fwcp"))
glimpse(export_df)
#> Rows: 212
#> Columns: 39
#> $ Item <chr> "ad", "ar", "ark", "art", "art^st", "bab", "bab…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH …
#> $ Frequency <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 1…
#> $ cohorts_num <int> 1, 4, 4, 4, 4, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,…
#> $ neighbors_num <int> 4, 8, 6, 5, 1, 4, 4, 2, 1, 7, 5, 1, 7, 5, 8, 3,…
#> $ rhymes_num <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3,…
#> $ homoforms_num <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ target_embeds_in_num <int> 6, 29, 5, 9, 1, 2, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1…
#> $ embeds_in_target_num <int> 1, 1, 2, 2, 5, 1, 3, 2, 1, 2, 4, 2, 1, 3, 3, 2,…
#> $ nohorts_num <int> 1, 3, 3, 3, 1, 3, 3, 2, 1, 3, 2, 1, 2, 2, 3, 1,…
#> $ cohortsP_num <int> 0, 1, 1, 1, 3, 4, 4, 5, 6, 4, 5, 6, 1, 1, 0, 2,…
#> $ neighborsP_num <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0,…
#> $ target_embeds_inP_num <int> 3, 21, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0…
#> $ embeds_in_targetP_num <int> 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
#> $ neighborsPr_num <int> 4, 6, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 4, 6, 3,…
#> $ cohorts_fwt <dbl> 3.970292, 22.634373, 22.634373, 22.634373, 22.6…
#> $ neighbors_fwt <dbl> 21.533445, 37.968634, 33.688446, 27.349358, 4.7…
#> $ rhymes_fwt <dbl> 13.142723, 24.473191, 19.684596, 15.046612, 4.7…
#> $ homoforms_fwt <dbl> 3.970292, 8.390723, 3.912023, 5.613128, 4.71849…
#> $ target_embeds_in_fwt <dbl> 29.792782, 127.685319, 22.680328, 42.517044, 4.…
#> $ embeds_in_target_fwt <dbl> 3.970292, 8.390723, 12.302746, 14.003851, 35.28…
#> $ nohorts_fwt <dbl> 3.970292, 17.915874, 17.915874, 17.915874, 4.71…
#> $ cohortsP_fwt <dbl> 0.000000, 4.718499, 4.718499, 4.718499, 17.9158…
#> $ neighborsP_fwt <dbl> 8.390723, 3.970292, 0.000000, 0.000000, 0.00000…
#> $ target_embeds_inP_fwt <dbl> 16.650059, 88.968478, 2.995732, 22.751933, 0.00…
#> $ embeds_in_targetP_fwt <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 16.5642…
#> $ neighborsPr_fwt <dbl> 21.533445, 28.443483, 19.684596, 15.046612, 4.7…
#> $ cohorts_fwcp <dbl> 1.00000000, 0.37070710, 0.17283550, 0.24799133,…
#> $ neighbors_fwcp <dbl> 0.1843779, 0.2209909, 0.1161236, 0.2052380, 1.0…
#> $ rhymes_fwcp <dbl> 0.3020905, 0.3428536, 0.1987352, 0.3730493, 1.0…
#> $ homoforms_fwcp <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ target_embeds_in_fwcp <dbl> 0.13326355, 0.06571407, 0.17248529, 0.13202066,…
#> $ embeds_in_target_fwcp <dbl> 1.0000000, 1.0000000, 0.3179797, 0.4008275, 0.1…
#> $ nohorts_fwcp <dbl> 1.0000000, 0.4683401, 0.2183551, 0.3133047, 1.0…
#> $ cohortsP_fwcp <dbl> 1.0000000, 0.6400626, 0.4532777, 0.5432957, 0.2…
#> $ neighborsP_fwcp <dbl> 0.3211947, 0.6788053, 1.0000000, 1.0000000, 1.0…
#> $ target_embeds_inP_fwcp <dbl> 0.19254240, 0.08618315, 0.56632333, 0.19788881,…
#> $ embeds_in_targetP_fwcp <dbl> 1.0000000, 1.0000000, 1.0000000, 1.0000000, 0.2…
#> $ neighborsPr_fwcp <dbl> 0.6232852, 1.0216590, 1.1987352, 1.3730493, 2.0…
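Finally, export_df contains only atomic columns, so it can be written straight to disk; a sketch using readr's write_csv (loaded with tidyverse above; the filename is our own).
# hypothetical export for use in other tools
write_csv(export_df, "lexfindr_competitors.csv")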