jpcompartir · jpcompartir · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,19 +1,20 @@
 Package: LimpiaR
 Title: LimpiaR
-Version: 1.1.1
+Version: 1.1.2
 Authors@R: 
     as.person(c(
-    "Jack Penzer <jack.penzer@sharecreative.com> [cre]",
+    "Jack Penzer <jack.penzer@samy.com> [cre]",
     "Tim Mooney <tim.mooney@sharecreative.com> [aut]",
-    "SHARE Creative [cph]"
+    "Ben Jessup <ben.jessup@samy.com> [aut]",
+    "SAMY [cph]"
     ))
-Description: SHARE & SAMY Group's dedicated pre-processing suite for Spanish & English data sets. A range of functions for getting text data ready to be analysed.
+Description: SAMY Data Science's dedicated pre-processing suite for Spanish & English data sets. A range of functions for getting text data ready to be analysed.
 URL: https://jpcompartir.github.io/LimpiaR
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 Imports:
     dplyr (>= 1.0.10),
     rlang (>= 1.0.6),

diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ export(limpiar_link_click)
 export(limpiar_link_click_reverse)
 export(limpiar_na_cols)
 export(limpiar_non_ascii)
+export(limpiar_phone_numbers)
 export(limpiar_pos_annotate)
 export(limpiar_pos_import_model)
 export(limpiar_pp_companies)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# LimpiaR 1.1.2
+- `limpiar_phone_numbers()` function introduced - flags documents that contain phone numbers in the text (and can optionally replace them with a tag); see the package's function reference section and the README for more information
+
 # LimpiaR 1.1.1
 
 -   Small fix for tests failing following a package change in the CI/CD VM

diff --git a/R/limpiar_phones.R b/R/limpiar_phones.R
@@ -0,0 +1,179 @@
+#' Flag (and optionally replace) phone numbers in text
+#'
+#' Function creates a logical flag column for posts containing phone numbers.
+#' Catches various phone number formats, i.e. US, UK, European etc.
+#' By default (`aggressive = TRUE`) plain digit sequences (7-15 digits) are
+#' also caught; set `aggressive = FALSE` to match recognised formats only.
+#' Can also be set to replace phone numbers with a string via `tag`.
+#'
+#' @param df Name of DataFrame or Tibble object
+#' @param text_var Name of text variable/character vector
+#' @param aggressive Bool: If TRUE (the default), also catches plain digit sequences (7-15 digits). Must be a single, non-missing TRUE/FALSE
+#' @param tag String: Default = "None" (flag only), if any other string is supplied, phone numbers are replaced with it
+#'
+#' @return The DataFrame or Tibble object with a logical `phone_number_flag` column
+#'
+#' @details
+#' Matches:
+#' \itemize{
+#'   \item International: +1 555-123-4567, +44 20 1234 5678
+#'   \item US/Canada: (555) 123-4567, 555-123-4567
+#'   \item UK: 07951 902 146, 01786 475545
+#'   \item European: 77 54 33 33
+#'   \item Latin American: 4782-0699
+#'   \item Local: 555-1234
+#' }
+#' Also matches when aggressive = TRUE:
+#' \itemize{
+#'   \item Plain digits: 07546104638, 1234567890
+#' }
+#'
+#' Avoids matching:
+#' \itemize{
+#'   \item 09:00-17:00
+#'   \item 192.168.1.1
+#'   \item $1,234,567
+#'   \item 1,000,000,000
+#'   \item 1995-2025
+#' }
+#'
+#' @examples
+#' # Example data
+#' phone_examples <- tibble::tibble(
+#'   id = 1:5,
+#'   text_var = c(
+#'     "Call me at 555-123-4567 or (555) 123-4568",
+#'     "WhatsApp +44 20 1234 5678",
+#'     "Contact: 07506308688",
+#'     "Meeting at 09:00-17:00, call 4782-0699",
+#'     "I earned £100,000,000 between 1995-2025"
+#'   )
+#' )
+#'
+#' # Recognised formats only (plain digit runs are not flagged)
+#' phone_examples %>%
+#'   limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) %>%
+#'   dplyr::select(text_var, phone_number_flag)
+#'
+#' # Default, more aggressive version, also catching sequences of digits between 7-15 in length
+#' phone_examples %>%
+#'   limpiar_phone_numbers(text_var = text_var, aggressive = TRUE) %>%
+#'   dplyr::select(text_var, phone_number_flag)
+#'
+#' # Filter out rows containing phone numbers
+#' phone_examples %>%
+#'   limpiar_phone_numbers(text_var = text_var, aggressive = FALSE) %>%
+#'   dplyr::filter(phone_number_flag == FALSE) %>%
+#'   dplyr::select(id, text_var)
+#'
+#' @export
+limpiar_phone_numbers <- function(df,
+                                   text_var = mention_content,
+                                   aggressive = TRUE,
+                                   tag = "None") {
+  # check data exists and is correct type
+  if (!is.data.frame(df)) {
+    stop("'df' must be a data.frame or tibble, but got type: ", class(df)[1])
+  }
+
+  # check aggressive is logical
+  if (!is.logical(aggressive)) {
+    stop("Parameter 'aggressive' must be logical (TRUE/FALSE), but got type: ",
+         class(aggressive)[1])
+  }
+  # `if (aggressive)` below needs exactly one non-missing value
+  if (length(aggressive) != 1 || is.na(aggressive)) {
+    stop("Parameter 'aggressive' must be a single TRUE or FALSE value")
+  }
+
+  # check tag exists and is a string
+  if (!is.character(tag) || length(tag) != 1) {
+    stop("Parameter 'tag' must be a single character string, but got type: ",
+         class(tag)[1])
+  }
+  # an NA tag would make the `tag != "None"` test below error cryptically
+  if (is.na(tag)) {
+    stop("Parameter 'tag' must not be NA")
+  }
+
+  # handle both quoted and unquoted column names
+  text_sym <- rlang::ensym(text_var)
+
+  # check text var is correct type
+  col_data <- dplyr::pull(df, !!text_sym)
+  if (!is.character(col_data)) {
+    stop("Parameter 'text_var' must be a character vector (string type), but got type: ",
+         class(col_data)[1])
+  }
+
+  # phone number format patterns
+  patterns <- c(
+    # international with + prefix
+    # matches +[1-3 digits][7-12 more digits with optional separators]
+    "\\+\\d{1,3}(?:[\\s.-]?\\d){7,12}",
+
+    # us/canada format with parentheses
+    # matches (XXX) XXX-XXXX or (XXX) XXX XXXX
+    "\\(\\d{3}\\)[\\s.-]?\\d{3}[\\s.-]?\\d{4}",
+
+    # standard 10-digit with separators
+    # matches XXX-XXX-XXXX or XXX XXX XXXX or XXX.XXX.XXXX
+    "\\d{3}[\\s.-]\\d{3}[\\s.-]\\d{4}",
+
+    # uk format (5-3-3)
+    # matches 0XXXX XXX XXX
+    "0\\d{4}\\s\\d{3}\\s\\d{3,6}",
+
+    # uk format (5-6 with single space)
+    # matches 0XXXX XXXXXX
+    "0\\d{4}\\s\\d{6}",
+
+    # european short format (8 digits with spaces)
+    # matches XX XX XX XX (exactly 8 digits)
+    "(?<!\\d)\\d{2}\\s\\d{2}\\s\\d{2}\\s\\d{2}(?!\\d)",
+
+    # 8-digit format with hyphen (4-4 pattern)
+    # matches XXXX-XXXX where first digit is 3-9 (avoids year ranges like 1995-2025)
+    "(?<!\\d)[3-9]\\d{3}-\\d{4}(?!\\d)",
+
+    # 7-digit local format with separator
+    # matches XXX-XXXX or XXX XXXX
+    "(?<!\\d)\\d{3}[\\s.-]\\d{4}(?!\\d)"
+  )
+
+  # add aggressive pattern if enabled
+  if (aggressive) {
+    # currency symbols (pound, euro, yen) so that prices are not matched
+    currency_symbols <- "\\u00a3\\u20ac\\u00a5"
+    aggressive_pattern <- paste0(
+      "(?<![a-zA-Z.,$/", currency_symbols, "\\d])",
+      "\\d{7,15}",
+      "(?![a-zA-Z.,\\d])"
+    )
+    patterns <- c(patterns, aggressive_pattern)
+  }
+
+  # combine patterns with negative lookarounds
+  full_pattern <- paste0(
+    "(?<!\\d)",  # not preceded by digit
+    "(?:",
+    paste(patterns, collapse = "|"),  # combining format patterns
+    ")",
+    "(?![:/=?&\\d-])"   # not followed by : / = ? & digit or dash
+  )
+
+  # output: flag is computed from the original text, before any replacement
+  df <- dplyr::mutate(
+    df,
+    phone_number_flag = stringr::str_detect(!!text_sym, full_pattern)
+  )
+  # "None" means flag only; any other tag also replaces the matches
+  if (tag != "None") {
+    df <- dplyr::mutate(
+      df,
+      !!text_sym := stringr::str_replace_all(!!text_sym, full_pattern, tag)
+    )
+  }
+
+  df
+}
diff --git a/README.Rmd b/README.Rmd
@@ -65,6 +65,7 @@ Functions for removing unwanted posts entirely (rather than cleaning).
 | [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content |
 | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns |
 | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis |
+| [limpiar_phone_numbers](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phone_numbers.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising |
 
 ## Utility
 

diff --git a/README.md b/README.md
@@ -62,6 +62,7 @@ Functions for removing unwanted posts entirely (rather than cleaning).
 | [limpiar_duplicates](https://jpcompartir.github.io/LimpiaR/reference/limpiar_duplicates.html) | Removes duplicate content | Language-agnostic | Data cleaning | Also removes protected content |
 | [limpiar_retweets](https://jpcompartir.github.io/LimpiaR/reference/limpiar_retweets.html) | Removes retweet content | Language-agnostic | Social media cleaning | Identifies RT patterns |
 | [limpiar_spam_grams](https://jpcompartir.github.io/LimpiaR/reference/limpiar_spam_grams.html) | Removes spam-like patterns | Language-agnostic | Content filtering | Uses n-gram analysis |
+| [limpiar_phone_numbers](https://jpcompartir.github.io/LimpiaR/reference/limpiar_phone_numbers.html) | Tags phone numbers in posts | Language-agnostic | Content filtering | For removing spam/advertising |
 
 ## Utility
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -94,6 +94,7 @@ reference:
   - limpiar_duplicates
   - limpiar_retweets
   - limpiar_spam_grams
+  - limpiar_phone_numbers
 
 - title: Utility Functions
   desc: >
@@ -119,3 +120,5 @@ repo:
 
 development:
   mode: auto
+  version_label: default
+  version_tooltip: "Version"
diff --git a/man/limpiar_phone_numbers.Rd b/man/limpiar_phone_numbers.Rd