diff --git a/DESCRIPTION b/DESCRIPTION index ae79cb3..e637ce8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,19 +1,20 @@ Package: LimpiaR Title: LimpiaR -Version: 1.1.1 +Version: 1.1.2 Authors@R: as.person(c( - "Jack Penzer [cre]", + "Jack Penzer [cre]", "Tim Mooney [aut]", - "SHARE Creative [cph]" + "Ben Jessup [aut]", + "SAMY[cph]" )) -Description: SHARE & SAMY Group's dedicated pre-processing suite for Spanish & English data sets. A range of functions for getting text data ready to be analysed. +Description: SAMY Data Science's dedicated pre-processing suite for Spanish & English data sets. A range of functions for getting text data ready to be analysed. URL: https://jpcompartir.github.io/LimpiaR License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: dplyr (>= 1.0.10), rlang (>= 1.0.6), diff --git a/NAMESPACE b/NAMESPACE index 86d390d..44d282d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(limpiar_link_click) export(limpiar_link_click_reverse) export(limpiar_na_cols) export(limpiar_non_ascii) +export(limpiar_phone_numbers) export(limpiar_pos_annotate) export(limpiar_pos_import_model) export(limpiar_pp_companies) diff --git a/NEWS.md b/NEWS.md index f86f7a9..5a8bacb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# LimpiaR 1.1.2 +- `limpiar_phone_numbers()` function introduced - flags documents that have phone numbers in the text; see the package's function reference section and README for more information + # LimpiaR 1.1.1 - Small fix for tests failing following a package change in the CI/CD VM diff --git a/R/limpiar_phones.R b/R/limpiar_phones.R new file mode 100644 index 0000000..e7cd1ec --- /dev/null +++ b/R/limpiar_phones.R @@ -0,0 +1,179 @@ +#' Clean phone numbers from text +#' +#' Function creates a flag column for posts containing phone numbers. +#' Catches various phone number formats, e.g. US, UK, European etc. 
+#' With `aggressive = FALSE` only phone numbers in a recognised format are matched. +#' With `aggressive = TRUE` (the default) plain digit sequences (7-15 digits) are also matched. +#' Supplying `tag` additionally replaces each matched phone number with that string. +#' +#' @param df Name of DataFrame or Tibble object +#' @param text_var Name of text variable/character vector +#' @param aggressive Bool: If TRUE, also catches plain digit sequences (7-15 digits) +#' @param tag String: Default = "None", if supplied replaces phone numbers with string +#' +#' @return The DataFrame or Tibble object with phone number flag column +#' +#' @details +#' Matches: +#' \itemize{ +#' \item International: +1 555-123-4567, +44 20 1234 5678 +#' \item US/Canada: (555) 123-4567, 555-123-4567 +#' \item UK: 07951 902 146, 01786 475545 +#' \item European: 77 54 33 33 +#' \item Latin American: 4782-0699 +#' \item Local: 555-1234 +#' } +#' Also matches when aggressive = TRUE: +#' \itemize{ +#' \item Plain digits: 07546104638, 1234567890 +#' } +#' +#' Avoids matching: +#' \itemize{ +#' \item 09:00-17:00 +#' \item 192.168.1.1 +#' \item $1,234,567 +#' \item 1,000,000,000 +#' \item 1995-2025 +#' } +#' +#' @examples +#' # Example data +#' phone_examples <- tibble::tibble( +#' id = 1:5, +#' text_var = c( +#' "Call me at 555-123-4567 or (555) 123-4568", +#' "WhatsApp +44 20 1234 5678", +#' "Contact: 07506308688", +#' "Meeting at 09:00-17:00, call 4782-0699", +#' "I earned £100,000,000 between 1995-2025" +#' ) +#' ) +#' +#' # Recognised formats only (aggressive = FALSE) +#' phone_examples %>% +#' limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) %>% +#' dplyr::select(text_var) +#' +#' # Aggressive matching (the default), also catching sequences of digits between 7-15 in length +#' phone_examples %>% +#' limpiar_phone_numbers(text_var = text_var, aggressive = TRUE) %>% +#' dplyr::select(text_var) +#' +#' # Filter out rows containing phone numbers +#' phone_examples %>% +#' limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) %>% +#' 
dplyr::filter(phone_number_flag == FALSE) %>% +#' dplyr::select(id, text_var) +#' +#' @export +#' +limpiar_phone_numbers <- function(df, + text_var = mention_content, + aggressive = TRUE, + tag = "None") { + + # check data exists and is correct type + if (!is.data.frame(df)) { + stop("'df' must be a data.frame or tibble, but got type: ", + class(df)[1]) + } + + # check aggressive is logical + if (!is.logical(aggressive)) { + stop("Parameter 'aggressive' must be logical (TRUE/FALSE), but got type: ", + class(aggressive)[1]) + } + + # check tag exists and is a string + if (!is.character(tag) || length(tag) != 1) { + stop("Parameter 'tag' must be a single character string, but got type: ", + class(tag)[1]) + } + + # handle both quoted and unquoted column names + text_sym <- rlang::ensym(text_var) + + # check text var is correct type + col_data <- dplyr::pull(df, !!text_sym) + + if (!is.character(col_data)) { + stop("Parameter 'text_var' must be a character vector (string type), but got type: ", + class(col_data)[1]) + } + + # phone number format patterns + patterns <- c( + # international with + prefix + # matches +[1-3 digits][7-12 more digits with optional separators] + "\\+\\d{1,3}(?:[\\s.-]?\\d){7,12}", + + # us/canada format with parentheses + # matches (XXX) XXX-XXXX or (XXX) XXX XXXX + "\\(\\d{3}\\)[\\s.-]?\\d{3}[\\s.-]?\\d{4}", + + # standard 10-digit with separators + # matches XXX-XXX-XXXX or XXX XXX XXXX or XXX.XXX.XXXX + "\\d{3}[\\s.-]\\d{3}[\\s.-]\\d{4}", + + # uk format (5-3-3) + # matches 0XXXX XXX XXX + "0\\d{4}\\s\\d{3}\\s\\d{3,6}", + + # uk format (5-6 with single space) + # matches 0XXXX XXXXXX + "0\\d{4}\\s\\d{6}", + + # european short format (8 digits with spaces) + # matches XX XX XX XX (exactly 8 digits) + "(?% + dplyr::mutate( + phone_number_flag = stringr::str_detect(!!text_sym, full_pattern) + ) + } else { + # create flag column and replace phone numbers with tag + df <- df %>% + dplyr::mutate( + phone_number_flag = 
stringr::str_detect(!!text_sym, full_pattern), + !!text_sym := stringr::str_replace_all(!!text_sym, full_pattern, tag) + ) + } + + return(df) +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 86fc087..8e66faf 100644 --- a/README.Rmd +++ b/README.Rmd @@ -65,6 +65,7 @@ Functions for removing unwanted posts entirely (rather than cleaning). | [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content | | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns | | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis | +| [limpiar_phone_numbers](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phone_numbers.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | ## Utility diff --git a/README.md b/README.md index 45e7b56..c7700bb 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ Functions for removing unwanted posts entirely (rather than cleaning). 
| [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content | | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns | | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis | +| [limpiar_phone_numbers](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phone_numbers.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising | ## Utility diff --git a/_pkgdown.yml b/_pkgdown.yml index 14f6566..c541c75 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -94,6 +94,7 @@ reference: - limpiar_duplicates - limpiar_retweets - limpiar_spam_grams + - limpiar_phone_numbers - title: Utility Functions desc: > @@ -119,3 +120,5 @@ repo: development: mode: auto + version_label: default + version_tooltip: "Version" diff --git a/man/limpiar_phone_numbers.Rd b/man/limpiar_phone_numbers.Rd new file mode 100644 index 0000000..d974529 --- /dev/null +++ b/man/limpiar_phone_numbers.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_phones.R +\name{limpiar_phone_numbers} +\alias{limpiar_phone_numbers} +\title{Clean phone numbers from text} +\usage{ +limpiar_phone_numbers( + df, + text_var = mention_content, + aggressive = TRUE, + tag = "None" +) +} +\arguments{ +\item{df}{Name of DataFrame or Tibble object} + +\item{text_var}{Name of text variable/character vector} + +\item{aggressive}{Bool: If TRUE, also catches plain digit sequences (7-15 digits)} + +\item{tag}{String: Default = "None", if supplied replaces phone numbers with string} +} +\value{ +The DataFrame or Tibble object with phone number flag 
column +} +\description{ +Function creates a flag column for posts containing phone numbers. +Catches various phone number formats, e.g. US, UK, European etc. +With \code{aggressive = FALSE} only phone numbers in a recognised format are matched. +With \code{aggressive = TRUE} (the default) plain digit sequences (7-15 digits) are also matched. +Supplying \code{tag} additionally replaces each matched phone number with that string. +} +\details{ +Matches: +\itemize{ +\item International: +1 555-123-4567, +44 20 1234 5678 +\item US/Canada: (555) 123-4567, 555-123-4567 +\item UK: 07951 902 146, 01786 475545 +\item European: 77 54 33 33 +\item Latin American: 4782-0699 +\item Local: 555-1234 +} +Also matches when aggressive = TRUE: +\itemize{ +\item Plain digits: 07546104638, 1234567890 +} + +Avoids matching: +\itemize{ +\item 09:00-17:00 +\item 192.168.1.1 +\item $1,234,567 +\item 1,000,000,000 +\item 1995-2025 +} +} +\examples{ +# Example data +phone_examples <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025" + ) +) + +# Recognised formats only (aggressive = FALSE) +phone_examples \%>\% + limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) \%>\% + dplyr::select(text_var) + +# Aggressive matching (the default), also catching sequences of digits between 7-15 in length +phone_examples \%>\% + limpiar_phone_numbers(text_var = text_var, aggressive = TRUE) \%>\% + dplyr::select(text_var) + +# Filter out rows containing phone numbers +phone_examples \%>\% + limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) \%>\% + dplyr::filter(phone_number_flag == FALSE) \%>\% + dplyr::select(id, text_var) + +} diff --git a/tests/testthat/test-limpiar_phones.R b/tests/testthat/test-limpiar_phones.R new file mode 100644 index 0000000..123d3b9 --- /dev/null +++ b/tests/testthat/test-limpiar_phones.R @@ -0,0 +1,152 @@ +input_example_1 <- tibble::tibble( + id = 1:5, + text_var = c( + 
"Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025") +) + +output_example_1a <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at phone_number or phone_number", + "WhatsApp phone_number", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call phone_number", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + FALSE, + TRUE, + FALSE + ) +) + +output_example_1b <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at phone_number or phone_number", + "WhatsApp phone_number", + "Contact: phone_number", + "Meeting at 09:00-17:00, call phone_number", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + TRUE, + TRUE, + FALSE + ) +) + +output_example_1c <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + FALSE, + TRUE, + FALSE + ) +) + +output_example_1d <- tibble::tibble( + id = 1:5, + text_var = c( + "Call me at 555-123-4567 or (555) 123-4568", + "WhatsApp +44 20 1234 5678", + "Contact: 07506308688", + "Meeting at 09:00-17:00, call 4782-0699", + "I earned £100,000,000 between 1995-2025"), + phone_number_flag = c( + TRUE, + TRUE, + TRUE, + TRUE, + FALSE + ) +) + +# Tests +test_that("Example test", { + # test example behaviour + output <- limpiar_phone_numbers( + input_example_1, + text_var = text_var, + aggressive = FALSE, + tag = "phone_number" + ); + expect_equal(output, output_example_1a); +}) + +test_that("Agressive test", { + # test aggressive works as expected + output <- limpiar_phone_numbers( + input_example_1, + text_var = text_var, + aggressive = TRUE, + tag = "phone_number" + ); + + # Test that the result is the 
correct value + expect_equal(output, output_example_1b); +}) + +test_that("No tag test", { + # Test that no-tag behaviour is expected + output <- limpiar_phone_numbers( + input_example_1, + text_var = text_var + ); + expect_equal(output, output_example_1d); +}) + +test_that("Data input test", { + # Test that data exists + expect_error(limpiar_phone_numbers( + "string", + text_var = text_var, + aggressive = FALSE, + tag = "phone_number" + ), regexp = "'df' must be a data.frame or tibble, but got type: character"); +}) + +test_that("text_var input test", { + # Test that text_var exists as a column + expect_error(limpiar_phone_numbers( + input_example_1, + text_var = "not_a_column", + aggressive = FALSE, + tag = "phone_number" + ), regexp = "object 'not_a_column' not found"); +}) + +test_that("Agressive input test", { + # Test that aggressive must be a bool + expect_error(limpiar_phone_numbers( + input_example_1, + text_var = text_var, + aggressive = "string", + tag = "phone_number" + ), regexp = "Parameter 'aggressive' must be logical"); +}) + +test_that("text input as string test", { + # test example behaviour + output <- limpiar_phone_numbers( + input_example_1, + text_var = "text_var", + aggressive = FALSE, + tag = "phone_number" + ); + expect_equal(output, output_example_1a); +}) \ No newline at end of file