From 5f674b2cfbeb229b2c8d15abceb0950d40c40ad7 Mon Sep 17 00:00:00 2001 From: benjessup-samy Date: Mon, 2 Feb 2026 11:47:49 +0000 Subject: [PATCH 1/6] phone tagging function creates a flag column for whether a post contains a phone number, can also replace phone numbers with text. Aggressive mode (default) tags consecutive digits (7-15) too. --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/limpiar_phones.R | 155 +++++++++++++++++++++++++++ README.Rmd | 1 + README.md | 1 + _pkgdown.yml | 1 + man/limpiar_phones.Rd | 75 +++++++++++++ tests/testthat/test-limpiar_phones.R | 144 +++++++++++++++++++++++++ 8 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 R/limpiar_phones.R create mode 100644 man/limpiar_phones.Rd create mode 100644 tests/testthat/test-limpiar_phones.R diff --git a/DESCRIPTION b/DESCRIPTION index ae79cb3..da6b643 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: dplyr (>= 1.0.10), rlang (>= 1.0.6), diff --git a/NAMESPACE b/NAMESPACE index 86d390d..81c758f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(limpiar_link_click) export(limpiar_link_click_reverse) export(limpiar_na_cols) export(limpiar_non_ascii) +export(limpiar_phones) export(limpiar_pos_annotate) export(limpiar_pos_import_model) export(limpiar_pp_companies) diff --git a/R/limpiar_phones.R b/R/limpiar_phones.R new file mode 100644 index 0000000..7e3d7b1 --- /dev/null +++ b/R/limpiar_phones.R @@ -0,0 +1,155 @@ +#' Clean phone numbers from text +#' +#' Function creates a flag column for posts containing phone numbers. +#' Catches various phone number formats, e.g. US, UK and European. +#' By default (aggressive = TRUE) plain digit sequences (7-15 digits) are also caught. +#' Set aggressive = FALSE to only match phone numbers in a recognised format. +#' Can also be set to replace phone numbers with a string via the tag argument. 
+#' +#' @param df Name of DataFrame or Tibble object +#' @param text_var Name of text variable/character vector +#' @param aggressive Bool: If TRUE, also catches plain digit sequences (7-15 digits) +#' @param tag String: Default = "None", if supplied replaces phone numbers with string +#' +#' @return The DataFrame or Tibble object with phone number flag column +#' +#' @details +#' Matches: +#' \itemize{ +#' \item International: +1 555-123-4567, +44 20 1234 5678 +#' \item US/Canada: (555) 123-4567, 555-123-4567 +#' \item UK: 07951 902 146, 01786 475545 +#' \item European: 77 54 33 33 +#' \item Latin American: 4782-0699 +#' \item Local: 555-1234 +#' } +#' Also matches when aggressive = TRUE: +#' \itemize{ +#' \item Plain digits: 07546104638, 1234567890 +#' } +#' +#' Avoids matching: +#' \itemize{ +#' \item 09:00-17:00 +#' \item 192.168.1.1 +#' \item $1,234,567 +#' \item 1,000,000,000 +#' \item 1995-2025 +#' } +#' +#' @examples +#' # Example data +#' phone_examples <- tibble::tibble( +#' id = 1:5, +#' text_var = c( +#' "Call me at 555-123-4567 or (555) 123-4568", +#' "WhatsApp +44 20 1234 5678", +#' "Contact: 07506308688", +#' "Meeting at 09:00-17:00, call 4782-0699", +#' "I earned £100,000,000 between 1995-2025" +#' ) +#' ) +#' +#' # Default example +#' phone_examples %>% +#' limpiar_phones(text_var = text_var) %>% +#' dplyr::select(text_var) +#' +#' # More aggressive version, catching sequences of digits between 7-15 in length +#' phone_examples %>% +#' limpiar_phones(text_var = text_var, aggressive = TRUE) %>% +#' dplyr::select(text_var) +#' +#' @export +#' +limpiar_phones <- function(df, + text_var = mention_content, + aggressive = TRUE, + tag = "None") { + + # handle both quoted and unquoted column names + text_var <- rlang::ensym(text_var) + + # check text var is correct type + col_data <- dplyr::pull(df, {{ text_var }}) + + if (!is.character(col_data)) { + stop("Parameter 'text_var' must be a character vector (string type), but got type: ", + 
class(col_data)[1]) + } + + # phone number format patterns + patterns <- c( + # international with + prefix + # matches +[1-3 digits][7-12 more digits with optional separators] + "\\+\\d{1,3}(?:[\\s.-]?\\d){7,12}", + + # us/canada format with parentheses + # matches (XXX) XXX-XXXX or (XXX) XXX XXXX + "\\(\\d{3}\\)[\\s.-]?\\d{3}[\\s.-]?\\d{4}", + + # standard 10-digit with separators + # matches XXX-XXX-XXXX or XXX XXX XXXX or XXX.XXX.XXXX + "\\d{3}[\\s.-]\\d{3}[\\s.-]\\d{4}", + + # uk format (5-3-3) + # matches 0XXXX XXX XXX + "0\\d{4}\\s\\d{3}\\s\\d{3,6}", + + # uk format (5-6 with single space) + # matches 0XXXX XXXXXX + "0\\d{4}\\s\\d{6}", + + # european short format (8 digits with spaces) + # matches XX XX XX XX (exactly 8 digits) + "(?% + dplyr::mutate( + phone_number_flag = stringr::str_detect(!!text_var, full_pattern) + ) + } else { + # create flag column and replace phone numbers with tag + df <- df %>% + dplyr::mutate( + phone_number_flag = stringr::str_detect(!!text_var, full_pattern), + !!text_var := stringr::str_replace_all(!!text_var, full_pattern, tag) + ) + } + + return(df) +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 86fc087..788a985 100644 --- a/README.Rmd +++ b/README.Rmd @@ -65,6 +65,7 @@ Functions for removing unwanted posts entirely (rather than cleaning). 
| [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content | | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns | | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis | +| [limpiar_phones](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phones.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | ## Utility diff --git a/README.md b/README.md index 45e7b56..ffaaab3 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ Functions for removing unwanted posts entirely (rather than cleaning). | [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content | | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns | | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis | +| [limpiar_phones](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phones.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | ## Utility diff --git a/_pkgdown.yml b/_pkgdown.yml index 14f6566..70f99da 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -94,6 +94,7 @@ reference: - limpiar_duplicates - limpiar_retweets - limpiar_spam_grams + - limpiar_phones - title: Utility Functions desc: > diff 
--git a/man/limpiar_phones.Rd b/man/limpiar_phones.Rd new file mode 100644 index 0000000..354fb8d --- /dev/null +++ b/man/limpiar_phones.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_phones.R +\name{limpiar_phones} +\alias{limpiar_phones} +\title{Clean phone numbers from text} +\usage{ +limpiar_phones(df, text_var = mention_content, aggressive = TRUE, tag = "None") +} +\arguments{ +\item{df}{Name of DataFrame or Tibble object} + +\item{text_var}{Name of text variable/character vector} + +\item{aggressive}{Bool: If TRUE, also catches plain digit sequences (7-15 digits)} + +\item{tag}{String: Default = "None", if supplied replaces phone numbers with string} +} +\value{ +The DataFrame or Tibble object with phone number flag column +} +\description{ +Function creates a flag column for posts containing phone numbers. +Catches various phone number formats, e.g. US, UK and European. +By default (aggressive = TRUE) plain digit sequences (7-15 digits) are also caught. +Set aggressive = FALSE to only match phone numbers in a recognised format. +Can also be set to replace phone numbers with a string via the tag argument. 
+} +\details{ +Matches: +\itemize{ +\item International: +1 555-123-4567, +44 20 1234 5678 +\item US/Canada: (555) 123-4567, 555-123-4567 +\item UK: 07951 902 146, 01786 475545 +\item European: 77 54 33 33 +\item Latin American: 4782-0699 +\item Local: 555-1234 +} +Also matches when aggressive = TRUE: +\itemize{ +\item Plain digits: 07546104638, 1234567890 +} + +Avoids matching: +\itemize{ +\item 09:00-17:00 +\item 192.168.1.1 +\item $1,234,567 +\item 1,000,000,000 +\item 1995-2025 +} +} +\examples{ +# Example data +phone_examples <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025" + ) +) + +# Default example +phone_examples \%>\% + limpiar_phones(text_var = text_var) \%>\% + dplyr::select(text_var) + +# More aggressive version, catching sequences of digits between 7-15 in length +phone_examples \%>\% + limpiar_phones(text_var = text_var, aggressive = TRUE) \%>\% + dplyr::select(text_var) + +} diff --git a/tests/testthat/test-limpiar_phones.R b/tests/testthat/test-limpiar_phones.R new file mode 100644 index 0000000..961bdc1 --- /dev/null +++ b/tests/testthat/test-limpiar_phones.R @@ -0,0 +1,144 @@ +input_example_1 <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025") +) + +output_example_1a <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at phone_number or phone_number", + "WhatsApp phone_number", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call phone_number", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + FALSE, + TRUE, + FALSE + ) +) + +output_example_1b <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at phone_number or 
phone_number", + "WhatsApp phone_number", + "Contact: phone_number", + "Meeting at 09:00-17:00, call phone_number", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + TRUE, + TRUE, + FALSE + ) +) + +output_example_1c <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + FALSE, + TRUE, + FALSE + ) +) + +output_example_1d <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + TRUE, + TRUE, + FALSE + ) +) + +# Tests +test_that("Example test", { + # test example behaviour + output <- limpiar_phones( + input_example_1, + text_var = text_var, + aggressive = FALSE, + tag = "phone_number" + ); + expect_equal(output, output_example_1a); +}) + +test_that("Agressive test", { + # test aggressive works as expected + output <- limpiar_phones( + input_example_1, + text_var = text_var, + aggressive = TRUE, + tag = "phone_number" + ); + + # Test that the result is the correct value + expect_equal(output, output_example_1b); +}) + +test_that("No tag test", { + # Test that no-tag behaviour is expected + output <- limpiar_phones( + input_example_1, + text_var = text_var + ); + expect_equal(output, output_example_1d); +}) + +test_that("Data input test", { + # Test that data exists + expect_error(limpiar_phones( + "string", + text_var = text_var, + aggressive = FALSE, + tag = "phone_number", + remove = FALSE + )); +}) + +test_that("text_var input test", { + # Test that text_var exists as a column + expect_error(limpiar_phones( + input_example_1, + text_var = "not_a_column", + aggressive = FALSE, + 
tag = "phone_number", + remove = FALSE + )); +}) + +test_that("Agressive input test", { + # Test that aggressive must be a bool + expect_error(limpiar_phones( + input_example_1, + text_var = text_var, + aggressive = "string", + tag = "phone_number", + remove = FALSE + )); +}) \ No newline at end of file From 0273fc3c41a62ecd8d103a019f4b4bc4060d7b63 Mon Sep 17 00:00:00 2001 From: benjessup-samy Date: Mon, 2 Feb 2026 12:19:19 +0000 Subject: [PATCH 2/6] Update test-limpiar_phones.R added test for string input of text_var --- tests/testthat/test-limpiar_phones.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/testthat/test-limpiar_phones.R b/tests/testthat/test-limpiar_phones.R index 961bdc1..3484446 100644 --- a/tests/testthat/test-limpiar_phones.R +++ b/tests/testthat/test-limpiar_phones.R @@ -141,4 +141,15 @@ test_that("Agressive input test", { tag = "phone_number", remove = FALSE )); +}) + +test_that("text input as string test", { + # test example behaviour + output <- limpiar_phones( + input_example_1, + text_var = "text_var", + aggressive = FALSE, + tag = "phone_number" + ); + expect_equal(output, output_example_1a); }) \ No newline at end of file From 1672a309c3e1cf9b617273b3e41adc8143c7d088 Mon Sep 17 00:00:00 2001 From: benjessup-samy Date: Mon, 2 Feb 2026 14:29:10 +0000 Subject: [PATCH 3/6] small fixes fixed consistency of text_var references in phone function fixed tests with reference to redundant variable and added regexp to expect_error tests --- R/limpiar_phones.R | 36 +++++++++++++++++++++++----- man/limpiar_phones.Rd | 8 ++++++- tests/testthat/test-limpiar_phones.R | 15 +++++------- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/R/limpiar_phones.R b/R/limpiar_phones.R index 7e3d7b1..c330068 100644 --- a/R/limpiar_phones.R +++ b/R/limpiar_phones.R @@ -52,13 +52,19 @@ #' #' # Default example #' phone_examples %>% -#' limpiar_phones(text_var = text_var) %>% +#' limpiar_phones(text_var = text_var, aggressive = FALSE) 
%>% #' dplyr::select(text_var) #' #' # More aggressive version, catching sequences of digits between 7-15 in length #' phone_examples %>% #' limpiar_phones(text_var = text_var, aggressive = TRUE) %>% #' dplyr::select(text_var) +#' +#' # Filter out rows containing phone numbers +#' phone_examples %>% +#' limpiar_phones(text_var = text_var, aggressive = FALSE) %>% +#' dplyr::filter(phone_number_flag == FALSE) %>% +#' dplyr::select(id, text_var) #' #' @export #' @@ -67,11 +73,29 @@ limpiar_phones <- function(df, aggressive = TRUE, tag = "None") { + # check data exists and is correct type + if (!is.data.frame(df)) { + stop("'df' must be a data.frame or tibble, but got type: ", + class(df)[1]) + } + + # check aggressive is logical + if (!is.logical(aggressive)) { + stop("Parameter 'aggressive' must be logical (TRUE/FALSE), but got type: ", + class(aggressive)[1]) + } + + # check tag exists and is a string + if (!is.character(tag) || length(tag) != 1) { + stop("Parameter 'tag' must be a single character string, but got type: ", + class(tag)[1]) + } + # handle both quoted and unquoted column names - text_var <- rlang::ensym(text_var) + text_sym <- rlang::ensym(text_var) # check text var is correct type - col_data <- dplyr::pull(df, {{ text_var }}) + col_data <- dplyr::pull(df, !!text_sym) if (!is.character(col_data)) { stop("Parameter 'text_var' must be a character vector (string type), but got type: ", @@ -140,14 +164,14 @@ limpiar_phones <- function(df, # if tag not changed from default value, only create flag column df <- df %>% dplyr::mutate( - phone_number_flag = stringr::str_detect(!!text_var, full_pattern) + phone_number_flag = stringr::str_detect(!!text_sym, full_pattern) ) } else { # create flag column and replace phone numbers with tag df <- df %>% dplyr::mutate( - phone_number_flag = stringr::str_detect(!!text_var, full_pattern), - !!text_var := stringr::str_replace_all(!!text_var, full_pattern, tag) + phone_number_flag = stringr::str_detect(!!text_sym, 
full_pattern), + !!text_sym := stringr::str_replace_all(!!text_sym, full_pattern, tag) ) } diff --git a/man/limpiar_phones.Rd b/man/limpiar_phones.Rd index 354fb8d..7bd8dae 100644 --- a/man/limpiar_phones.Rd +++ b/man/limpiar_phones.Rd @@ -64,7 +64,7 @@ phone_examples <- tibble::tibble( # Default example phone_examples \%>\% - limpiar_phones(text_var = text_var) \%>\% + limpiar_phones(text_var = text_var, aggressive = FALSE) \%>\% dplyr::select(text_var) # More aggressive version, catching sequences of digits between 7-15 in length @@ -72,4 +72,10 @@ phone_examples \%>\% limpiar_phones(text_var = text_var, aggressive = TRUE) \%>\% dplyr::select(text_var) +# Filter out rows containing phone numbers +phone_examples \%>\% + limpiar_phones(text_var = text_var, aggressive = FALSE) \%>\% + dplyr::filter(phone_number_flag == FALSE) \%>\% + dplyr::select(id, text_var) + } diff --git a/tests/testthat/test-limpiar_phones.R b/tests/testthat/test-limpiar_phones.R index 3484446..a20a4b2 100644 --- a/tests/testthat/test-limpiar_phones.R +++ b/tests/testthat/test-limpiar_phones.R @@ -116,9 +116,8 @@ test_that("Data input test", { "string", text_var = text_var, aggressive = FALSE, - tag = "phone_number", - remove = FALSE - )); + tag = "phone_number" + ), regexp = "'df' must be a data.frame or tibble, but got type: character"); }) test_that("text_var input test", { @@ -127,9 +126,8 @@ test_that("text_var input test", { input_example_1, text_var = "not_a_column", aggressive = FALSE, - tag = "phone_number", - remove = FALSE - )); + tag = "phone_number" + ), regexp = "object 'not_a_column' not found"); }) test_that("Agressive input test", { @@ -138,9 +136,8 @@ test_that("Agressive input test", { input_example_1, text_var = text_var, aggressive = "string", - tag = "phone_number", - remove = FALSE - )); + tag = "phone_number" + ), regexp = "Parameter 'aggressive' must be logical"); }) test_that("text input as string test", { From 073a3b998b0094a974a55edbfe93bd72be569d00 Mon Sep 17 
00:00:00 2001 From: benjessup-samy Date: Mon, 2 Feb 2026 14:50:37 +0000 Subject: [PATCH 4/6] changing name to phone_numbers limpiar_phones -> limpiar_phone_numbers --- NAMESPACE | 2 +- R/limpiar_phones.R | 8 ++++---- README.Rmd | 2 +- README.md | 2 +- _pkgdown.yml | 2 +- ...mpiar_phones.Rd => limpiar_phone_numbers.Rd} | 17 +++++++++++------ tests/testthat/test-limpiar_phones.R | 14 +++++++------- 7 files changed, 26 insertions(+), 21 deletions(-) rename man/{limpiar_phones.Rd => limpiar_phone_numbers.Rd} (84%) diff --git a/NAMESPACE b/NAMESPACE index 81c758f..44d282d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(limpiar_link_click) export(limpiar_link_click_reverse) export(limpiar_na_cols) export(limpiar_non_ascii) -export(limpiar_phones) +export(limpiar_phone_numbers) export(limpiar_pos_annotate) export(limpiar_pos_import_model) export(limpiar_pp_companies) diff --git a/R/limpiar_phones.R b/R/limpiar_phones.R index c330068..e7cd1ec 100644 --- a/R/limpiar_phones.R +++ b/R/limpiar_phones.R @@ -52,23 +52,23 @@ #' #' # Default example #' phone_examples %>% -#' limpiar_phones(text_var = text_var, aggressive = FALSE) %>% +#' limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) %>% #' dplyr::select(text_var) #' #' # More aggressive version, catching sequences of digits between 7-15 in length #' phone_examples %>% -#' limpiar_phones(text_var = text_var, aggressive = TRUE) %>% +#' limpiar_phone_numbers(text_var = text_var, aggressive = TRUE) %>% #' dplyr::select(text_var) #' #' # Filter out rows containing phone numbers #' phone_examples %>% -#' limpiar_phones(text_var = text_var, aggressive = FALSE) %>% +#' limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) %>% #' dplyr::filter(phone_number_flag == FALSE) %>% #' dplyr::select(id, text_var) #' #' @export #' -limpiar_phones <- function(df, +limpiar_phone_numbers <- function(df, text_var = mention_content, aggressive = TRUE, tag = "None") { diff --git a/README.Rmd b/README.Rmd index 
788a985..8e66faf 100644 --- a/README.Rmd +++ b/README.Rmd @@ -65,7 +65,7 @@ Functions for removing unwanted posts entirely (rather than cleaning). | [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content | | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns | | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis | -| [limpiar_phones](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phones.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | +| [limpiar_phone_numbers](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phone_numbers.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | ## Utility diff --git a/README.md b/README.md index ffaaab3..c7700bb 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ Functions for removing unwanted posts entirely (rather than cleaning). 
| [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content | | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns | | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis | -| [limpiar_phones](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phones.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | +| [limpiar_phone_numbers](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phone_numbers.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | ## Utility diff --git a/_pkgdown.yml b/_pkgdown.yml index 70f99da..a904fa4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -94,7 +94,7 @@ reference: - limpiar_duplicates - limpiar_retweets - limpiar_spam_grams - - limpiar_phones + - limpiar_phone_numbers - title: Utility Functions desc: > diff --git a/man/limpiar_phones.Rd b/man/limpiar_phone_numbers.Rd similarity index 84% rename from man/limpiar_phones.Rd rename to man/limpiar_phone_numbers.Rd index 7bd8dae..d974529 100644 --- a/man/limpiar_phones.Rd +++ b/man/limpiar_phone_numbers.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/limpiar_phones.R -\name{limpiar_phones} -\alias{limpiar_phones} +\name{limpiar_phone_numbers} +\alias{limpiar_phone_numbers} \title{Clean phone numbers from text} \usage{ -limpiar_phones(df, text_var = mention_content, aggressive = TRUE, tag = "None") +limpiar_phone_numbers( + df, + text_var = mention_content, + aggressive = TRUE, + tag = "None" +) } \arguments{ \item{df}{Name of DataFrame 
or Tibble object} @@ -64,17 +69,17 @@ phone_examples <- tibble::tibble( # Default example phone_examples \%>\% - limpiar_phones(text_var = text_var, aggressive = FALSE) \%>\% + limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) \%>\% dplyr::select(text_var) # More aggressive version, catching sequences of digits between 7-15 in length phone_examples \%>\% - limpiar_phones(text_var = text_var, aggressive = TRUE) \%>\% + limpiar_phone_numbers(text_var = text_var, aggressive = TRUE) \%>\% dplyr::select(text_var) # Filter out rows containing phone numbers phone_examples \%>\% - limpiar_phones(text_var = text_var, aggressive = FALSE) \%>\% + limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) \%>\% dplyr::filter(phone_number_flag == FALSE) \%>\% dplyr::select(id, text_var) diff --git a/tests/testthat/test-limpiar_phones.R b/tests/testthat/test-limpiar_phones.R index a20a4b2..123d3b9 100644 --- a/tests/testthat/test-limpiar_phones.R +++ b/tests/testthat/test-limpiar_phones.R @@ -79,7 +79,7 @@ output_example_1d <- tibble::tibble( # Tests test_that("Example test", { # test example behaviour - output <- limpiar_phones( + output <- limpiar_phone_numbers( input_example_1, text_var = text_var, aggressive = FALSE, @@ -90,7 +90,7 @@ test_that("Example test", { test_that("Agressive test", { # test aggressive works as expected - output <- limpiar_phones( + output <- limpiar_phone_numbers( input_example_1, text_var = text_var, aggressive = TRUE, @@ -103,7 +103,7 @@ test_that("Agressive test", { test_that("No tag test", { # Test that no-tag behaviour is expected - output <- limpiar_phones( + output <- limpiar_phone_numbers( input_example_1, text_var = text_var ); @@ -112,7 +112,7 @@ test_that("No tag test", { test_that("Data input test", { # Test that data exists - expect_error(limpiar_phones( + expect_error(limpiar_phone_numbers( "string", text_var = text_var, aggressive = FALSE, @@ -122,7 +122,7 @@ test_that("Data input test", { test_that("text_var input 
test", { # Test that text_var exists as a column - expect_error(limpiar_phones( + expect_error(limpiar_phone_numbers( input_example_1, text_var = "not_a_column", aggressive = FALSE, @@ -132,7 +132,7 @@ test_that("text_var input test", { test_that("Agressive input test", { # Test that aggressive must be a bool - expect_error(limpiar_phones( + expect_error(limpiar_phone_numbers( input_example_1, text_var = text_var, aggressive = "string", @@ -142,7 +142,7 @@ test_that("Agressive input test", { test_that("text input as string test", { # test example behaviour - output <- limpiar_phones( + output <- limpiar_phone_numbers( input_example_1, text_var = "text_var", aggressive = FALSE, From 8a5155c77cbab3119c1da7002eb67b920e01e738 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Mon, 2 Feb 2026 15:34:31 +0000 Subject: [PATCH 5/6] Bump version in DESCRIPTION , add item to NEWS.md, have pkgdown render the news sidebar automatically --- DESCRIPTION | 2 +- NEWS.md | 3 +++ _pkgdown.yml | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index da6b643..8f11663 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: LimpiaR Title: LimpiaR -Version: 1.1.1 +Version: 1.1.2 Authors@R: as.person(c( "Jack Penzer [cre]", diff --git a/NEWS.md b/NEWS.md index f86f7a9..5a8bacb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# LimpiaR 1.1.2 +- `limpiar_phone_numbers()` function introduced - flags documents that have phone numbers in the text, see package's function reference section and README for more information + # LimpiaR 1.1.1 - Small fix for tests failing following a package change in the CI/CD VM diff --git a/_pkgdown.yml b/_pkgdown.yml index a904fa4..c541c75 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -120,3 +120,5 @@ repo: development: mode: auto + version_label: default + version_tooltip: "Version" From a7ba33541b729a676c264c51f574f02c4f4f8229 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Mon, 2 Feb 2026 15:38:36 
+0000 Subject: [PATCH 6/6] add BEN as a contributor, update my email, update copyright holder --- DESCRIPTION | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8f11663..e637ce8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,11 +3,12 @@ Title: LimpiaR Version: 1.1.2 Authors@R: as.person(c( - "Jack Penzer [cre]", + "Jack Penzer [cre]", "Tim Mooney [aut]", - "SHARE Creative [cph]" + "Ben Jessup [aut]", + "SAMY [cph]" )) -Description: SHARE & SAMY Group's dedicated pre-processing suite for Spanish & English data sets. A range of functions for getting text data ready to be analysed. +Description: SAMY Data Science's dedicated pre-processing suite for Spanish & English data sets. A range of functions for getting text data ready to be analysed. URL: https://jpcompartir.github.io/LimpiaR License: MIT + file LICENSE Encoding: UTF-8