── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'googledrive'
The following objects are masked from 'package:googlesheets4':
request_generate, request_make
R notebook for the corpus-based study of English suffixes -ance and -ancy
Overview
As of the 28th of May 2025, the article, titled “Productivity and Distribution of Nominal Suffixes -ance and -ancy in the Corpus of Contemporary American English” is under review in LET: Linguistics, Literature and English Teaching Journal.
Data preparation
# df <- read_sheet(ss = datasheet) <- run this regularly to check update
# write_rds(df, "data-raw/ance-ancy.rds")
<- read_rds("data-raw/ance-ancy.rds")
df <- read_tsv("data-raw/unanalysed.csv") |>
unanalysed mutate(SUFFIX = replace_na(SUFFIX, ""))
Rows: 279 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): FORM, ROOT, SUFFIX, ETYMOLOGY, REMARK
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- unanalysed |>
unanalysed1 filter(!is.na(ROOT), SUFFIX != "-mancy") |>
filter(FORM != "CIRCUMSTANCE") |> # historical relation with circum + stance
mutate(SUFFIX = replace(SUFFIX, SUFFIX == "", "ancy")) |>
select(FORM, ROOT_new = ROOT, SUFFIX_new = SUFFIX, ETYMOLOGY_new = ETYMOLOGY)
# df_combined <- df |>
# left_join(unanalysed1,
# by = join_by(FORM))|>
# mutate(ROOT = if_else(!is.na(ROOT_new),
# ROOT_new,
# ROOT),
# ETYMOLOGY = if_else(!is.na(ETYMOLOGY_new),
# ETYMOLOGY_new,
# ETYMOLOGY),
# SUFFIX = if_else(!is.na(SUFFIX_new),
# SUFFIX_new,
# SUFFIX)) |>
# select(!matches("_new"))
<- df df_combined
Try the current analysed data
<- filter(df_combined, !is.na(ROOT))
df_ok # df_ok <- filter(df, !is.na(ROOT))
|> slice_sample(n = 10) df_ok
# A tibble: 10 × 7
ID FORM FREQ GENRE SUFFIX ROOT ETYMOLOGY
<dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
1 28 MENDICANCY 44 ACADEMIC ancy mendicant mendicancy is formed wi…
2 3 PERFORMANCE 12767 BLOG ance perform performance is formed w…
3 86 BIBLIOMANCY 1 NEWSPAPER ancy bibliomancy bibliomancy is formed w…
4 50 RELEVANCE 456 NEWSPAPER ance relevant relevance is formed wit…
5 15 INERRANCY 123 WEBSITE ancy inerrant inerrancy is formed wit…
6 58 RESEMBLANCE 222 SPOKEN ance resemble resemblance is a borrow…
7 77 SEMBLANCE 240 MAGAZINE ance semblance semblance is a borrowin…
8 58 NUISANCE 358 NEWSPAPER ance nuisant nuisance is a borrowing…
9 36 INCONSTANCY 3 TV/MOVIES ancy inconstant inconstancy is a borrow…
10 61 DEFIANCE 416 MAGAZINE ance defiant defiance is a borrowing…
Productivity analysis
Run productivity analysis per suffix (Table 1).
# drive_create("productivity-by-affix",
# path = as_id("1e_0dwXEH_qTTzUdDGNxI_LHQSFIS98qj"),
# type = "spreadsheet")
# Created Drive file:
# • productivity-by-affix <id: 1g9fw5PVUrSiR9TGA9tQlOBPektaZJaVMqBK7_wwAIOs>
# With MIME type:
# • application/vnd.google-apps.spreadsheet
<- df_ok |>
prod_by_affix
# remove GENRE to re-calculate frequency of suffixes and their hapax
select(-GENRE) |>
# to sum token frequency
group_by(FORM, SUFFIX, ROOT) |>
summarise(n_token = sum(FREQ), .groups = "drop") |>
# determine the hapax
mutate(is_hapax = if_else(n_token == 1, TRUE, FALSE)) |>
# to run productivity analysis
group_by(SUFFIX) |>
summarise(n_type = n_distinct(FORM),
n_token = sum(n_token),
n_hapax = sum(is_hapax),
hapax_per_token_ratio = n_hapax/n_token)
|>
prod_by_affix ::kable(col.names = c("Suffix", "Type Freq.", "Token Freq.", "No. Hapax", "Hapax per Token Ratio"))
knitr# prod_by_affix |>
# rename(Suffix = SUFFIX,
# `Type Freq.` = n_type,
# `Token Freq.` = n_token,
# `No. of Hapax` = n_hapax,
# `Hapax per token ratio` = hapax_per_token_ratio) |>
# write_sheet(ss = "1g9fw5PVUrSiR9TGA9tQlOBPektaZJaVMqBK7_wwAIOs",
# sheet = "Sheet1")
write_tsv(prod_by_affix, file = "data-out/productivity_overall_by_affix.tsv")
Suffix | Type Freq. | Token Freq. | No. Hapax | Hapax per Token Ratio |
---|---|---|---|---|
ance | 97 | 823125 | 0 | 0.0000000 |
ancy | 103 | 48601 | 11 | 0.0002263 |
Run productivity analysis by genres using the type frequency measure (Figure 1).
<- df_ok |>
ance filter(SUFFIX == "ance")
<- ance |>
ance_word_freq group_by(FORM) |>
summarise(FREQ = sum(FREQ))
<- df_ok |>
ancy filter(SUFFIX == "ancy")
<- ancy |>
ancy_word_freq group_by(FORM) |>
summarise(FREQ = sum(FREQ))
<- ance |>
ance_prod group_by(GENRE) |>
# determine the hapax in a given genre
mutate(is_hapax = if_else(FREQ == 1, TRUE, FALSE)) |>
summarise(n_type = n_distinct(FORM),
n_token = sum(FREQ),
n_hapax = sum(is_hapax)) |>
ungroup() |>
mutate(suffix = "ancy")
<- ancy |>
ancy_prod group_by(GENRE) |>
# determine the hapax in a given genre
mutate(is_hapax = if_else(FREQ == 1, TRUE, FALSE)) |>
summarise(n_type = n_distinct(FORM),
n_token = sum(FREQ),
n_hapax = sum(is_hapax)) |>
ungroup() |>
mutate(suffix = "ance")
<- bind_rows(ance_prod, ancy_prod) genre_prod
|>
genre_prod mutate(suffix = str_c("-", suffix, sep = "")) |>
ggplot(aes(x = GENRE, y = n_type, fill = suffix)) +
geom_col(position = position_dodge(width = .9)) +
coord_flip() +
theme_light(base_family = "serif") +
labs(y = "Type frequency",
fill = "Suffix",
x = "Genre") +
theme(legend.text = element_text(size = 13),
legend.title = element_text(size = 17),
axis.title.y = element_text(size = 17),
axis.title.x = element_text(size = 17),
axis.text.x = element_text(size = 13)) +
scale_fill_discrete(breaks = c("-ancy", "-ance"))
# ggsave("figures/prod-by-genre.png", width = 6.5, height = 4.5, dpi = 300,
# units = "in")
# googledrive::drive_upload("figures/prod-by-genre.png",
# path = as_id("18ppX_-Nr-dI-iQojYC5WHpGZi_dKojhA"),
# name = "productivity-by-genre.png")

Interchangeability of the base/root
Check the shared and distinct bases (Table 2). This is operationalised via stripping off the -ance and -ancy strings from the FORM column.
<- ance_word_freq |>
ance_form mutate(BASE = str_replace(FORM, "ANCE$", ""),
suffix = "ance")
<- ancy_word_freq |>
ancy_form mutate(BASE = str_replace(FORM, "ANCY$", ""),
suffix = "ancy")
<- bind_rows(ance_form, ancy_form)
all_ance_ancy
<- unique(c(ance_form$BASE, ancy_form$BASE))
all_bases
<- intersect(ance_form$BASE, ancy_form$BASE)
shared_ance_ancy <- round((length(shared_ance_ancy)/length(all_bases)) * 100, 2)
shared_ance_ancy_prop
<- setdiff(ance_form$BASE, ancy_form$BASE)
only_ance <- round((length(only_ance)/length(all_bases)) * 100, 2)
only_ance_prop
<- setdiff(ancy_form$BASE, ance_form$BASE)
only_ancy <- round((length(only_ancy)/length(all_bases)) * 100, 2) only_ancy_prop
Out of the total 183 bases, only 9.29% (i.e., 17 bases) are shared (i.e., appear with -ance and -ancy) but the frequency of occurrence of these shared bases with the suffixes are not equal.
<- all_ance_ancy |>
all_ance_ancy_tb filter(BASE %in% shared_ance_ancy) |>
select(-FORM) |>
pivot_wider(names_from = "suffix", values_from = "FREQ") |>
mutate(BASE = str_c(BASE, "ANC(E/Y)", sep = "")) |>
arrange(desc(ancy))
write_tsv(all_ance_ancy_tb, "data-out/all_ance_ancy_tb.tsv")
|>
all_ance_ancy_tb arrange(BASE) |>
::kable() knitr
Binomial test
<- all_ance_ancy_tb |>
all_ance_ancy_tb_binom mutate(binom = map2(ance,
ancy, ~binom.test(.x,
sum(c(.x, .y)),
5))) |>
.mutate(binom = map(binom, ~tidy(.))) |>
unnest_wider(binom) |>
mutate(p_holm = p.adjust(p.value, method = "holm")) |>
arrange(BASE) |>
mutate(across(matches("(^estimate|^conf)"), ~round(., digits = 4)))
|>
all_ance_ancy_tb_binom ::kable()
knitrwrite_tsv(all_ance_ancy_tb_binom, file = "data-out/interchangeable-binom-test.tsv")
BASE | ance | ancy | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative | p_holm |
---|---|---|---|---|---|---|---|---|---|---|---|
ARROGANC(E/Y) | 5049 | 5 | 0.9990 | 5049 | 0 | 5054 | 0.9977 | 0.9997 | Exact binomial test | two.sided | 0 |
BRILLIANC(E/Y) | 3203 | 86 | 0.9739 | 3203 | 0 | 3289 | 0.9678 | 0.9790 | Exact binomial test | two.sided | 0 |
COMPLIANC(E/Y) | 12987 | 9 | 0.9993 | 12987 | 0 | 12996 | 0.9987 | 0.9997 | Exact binomial test | two.sided | 0 |
CONSTANC(E/Y) | 1974 | 594 | 0.7687 | 1974 | 0 | 2568 | 0.7519 | 0.7849 | Exact binomial test | two.sided | 0 |
CONTINUANC(E/Y) | 805 | 6 | 0.9926 | 805 | 0 | 811 | 0.9840 | 0.9973 | Exact binomial test | two.sided | 0 |
DEVIANC(E/Y) | 323 | 112 | 0.7425 | 323 | 0 | 435 | 0.6987 | 0.7830 | Exact binomial test | two.sided | 0 |
EXTRAVAGANC(E/Y) | 458 | 2 | 0.9957 | 458 | 0 | 460 | 0.9844 | 0.9995 | Exact binomial test | two.sided | 0 |
FRAGRANC(E/Y) | 3055 | 3 | 0.9990 | 3055 | 0 | 3058 | 0.9971 | 0.9998 | Exact binomial test | two.sided | 0 |
IGNORANC(E/Y) | 13310 | 1 | 0.9999 | 13310 | 0 | 13311 | 0.9996 | 1.0000 | Exact binomial test | two.sided | 0 |
IRRELEVANC(E/Y) | 624 | 171 | 0.7849 | 624 | 0 | 795 | 0.7547 | 0.8130 | Exact binomial test | two.sided | 0 |
PREDOMINANC(E/Y) | 417 | 2 | 0.9952 | 417 | 0 | 419 | 0.9829 | 0.9994 | Exact binomial test | two.sided | 0 |
RADIANC(E/Y) | 1075 | 2 | 0.9981 | 1075 | 0 | 1077 | 0.9933 | 0.9998 | Exact binomial test | two.sided | 0 |
RELEVANC(E/Y) | 8094 | 589 | 0.9322 | 8094 | 0 | 8683 | 0.9267 | 0.9374 | Exact binomial test | two.sided | 0 |
RELUCTANC(E/Y) | 4560 | 3 | 0.9993 | 4560 | 0 | 4563 | 0.9981 | 0.9999 | Exact binomial test | two.sided | 0 |
RESISTANC(E/Y) | 32578 | 1 | 1.0000 | 32578 | 0 | 32579 | 0.9998 | 1.0000 | Exact binomial test | two.sided | 0 |
RESONANC(E/Y) | 3919 | 3 | 0.9992 | 3919 | 0 | 3922 | 0.9978 | 0.9998 | Exact binomial test | two.sided | 0 |
SIGNIFICANC(E/Y) | 21828 | 6 | 0.9997 | 21828 | 0 | 21834 | 0.9994 | 0.9999 | Exact binomial test | two.sided | 0 |
Reuse
Citation
BibTeX citation:
@software{budiani2025,
author = {Budiani, Ni Ketut and Rajeg, Gede Primahadi Wijaya and
Netra, I Made},
title = {R Notebook for the Corpus-Based Study of {English} Suffixes
-\emph{Ance} and -\emph{Ancy}},
date = {2025-03-18},
url = {https://complexico.github.io/ance-ancy-suffix/},
doi = {10.17605/OSF.IO/7YJ49},
langid = {en}
}
For attribution, please cite this work as:
Budiani, Ni Ketut, Gede Primahadi Wijaya Rajeg, and I Made Netra. 2025.
“R Notebook for the Corpus-Based Study of English Suffixes
-Ance and -Ancy.” https://doi.org/10.17605/OSF.IO/7YJ49.