diff --git a/.github/scripts/check-bibliography-dois.R b/.github/scripts/check-bibliography-dois.R
new file mode 100755
index 00000000..e077110b
--- /dev/null
+++ b/.github/scripts/check-bibliography-dois.R
@@ -0,0 +1,424 @@
+#!/usr/bin/env Rscript
+# Check bibliography files for DOI requirements:
+# 1. Every book and article must have a DOI field
+# 2. Every DOI must resolve to a valid URL
+# 3. Reference information should match the document at the DOI URL
+#    (mismatches are printed as warnings and do not fail the check)
+
+suppressPackageStartupMessages({
+  library(bib2df)
+  library(httr)
+  library(jsonlite)
+  library(stringr)
+})
+
+#' Parse BibTeX file and extract entries
+#'
+#' @param filepath Path to BibTeX file
+#' @return Data frame of bibliography entries, or NULL if parsing failed
+parse_bibtex_file <- function(filepath) {
+  tryCatch({
+    bib_df <- bib2df(filepath)
+    return(bib_df)
+  }, error = function(e) {
+    cat(sprintf("Error parsing BibTeX file: %s\n", e$message))
+    return(NULL)
+  })
+}
+
+#' Check if entry has a DOI field
+#'
+#' Only "book" and "article" entries are required to carry a DOI; any other
+#' entry type passes unconditionally.
+#'
+#' @param entry Single bibliography entry (row from data frame)
+#' @return List with has_doi (logical) and error (string or NULL)
+check_doi_field <- function(entry) {
+  entry_type <- tolower(entry$CATEGORY)
+  entry_key <- entry$BIBTEXKEY
+
+  if (entry_type %in% c("book", "article")) {
+    # Look the column up by name so an entry without any DOI column yields
+    # NULL here instead of a zero-length condition in the test below.
+    doi <- if ("DOI" %in% names(entry)) entry[["DOI"]] else NULL
+    doi_missing <- is.null(doi) || length(doi) == 0 ||
+      is.na(doi[[1]]) || !nzchar(trimws(as.character(doi[[1]])))
+
+    if (doi_missing) {
+      return(list(
+        has_doi = FALSE,
+        error = sprintf("Entry '%s' (%s) is missing DOI field", entry_key, entry_type)
+      ))
+    }
+  }
+
+  return(list(has_doi = TRUE, error = NULL))
+}
+
+#' Extract the bare DOI identifier from a string
+#'
+#' @param doi DOI string, possibly with surrounding whitespace or a URL prefix
+#' @return First "10.<digits>/<non-space>" token in the string, or NA if none
+extract_doi_identifier <- function(doi) {
+  str_extract(trimws(doi), "10\\.\\d+/[^\\s]+")
+}
+
+#' Validate that a DOI resolves to a valid URL
+#'
+#' @param doi DOI string
+#' @return List with is_valid (logical), error (string or NULL) and
+#'   status_code (HTTP status, or NULL when no response was received)
+validate_doi_url <- function(doi) {
+  # Clean up DOI
+  doi <- trimws(doi)
+
+  # Extract just the DOI identifier
+  doi_identifier <- extract_doi_identifier(doi)
+
+  if (is.na(doi_identifier)) {
+    return(list(
+      is_valid = FALSE,
+      error = sprintf("Invalid DOI format: %s", doi),
+      status_code = NULL
+    ))
+  }
+
+  doi_url <- sprintf("https://doi.org/%s", doi_identifier)
+
+  # NOTE(review): only a final status of 200 counts as valid. If the request
+  # follows redirects to a publisher site that rejects automated clients, a
+  # registered DOI would be reported as invalid -- confirm this is intended.
+  tryCatch({
+    response <- GET(
+      doi_url,
+      timeout(10),
+      user_agent("Mozilla/5.0 (compatible; BibliographyChecker/1.0)")
+    )
+
+    # Named http_status so the local does not shadow the status_code() function
+    http_status <- status_code(response)
+
+    if (http_status == 200) {
+      return(list(
+        is_valid = TRUE,
+        error = NULL,
+        status_code = http_status
+      ))
+    } else {
+      return(list(
+        is_valid = FALSE,
+        error = sprintf("DOI URL returned status %d", http_status),
+        status_code = http_status
+      ))
+    }
+  }, error = function(e) {
+    return(list(
+      is_valid = FALSE,
+      error = sprintf("Error accessing DOI: %s", e$message),
+      status_code = NULL
+    ))
+  })
+}
+
+#' Get DOI metadata from CrossRef API
+#'
+#' @param doi DOI string
+#' @return Metadata list (the "message" element of the API response), or NULL
+#'   if the DOI is malformed, the request fails, or the status is not 200
+get_doi_metadata <- function(doi) {
+  doi_identifier <- extract_doi_identifier(doi)
+
+  if (is.na(doi_identifier)) {
+    return(NULL)
+  }
+
+  api_url <- sprintf("https://api.crossref.org/works/%s", doi_identifier)
+
+  tryCatch({
+    response <- GET(
+      api_url,
+      timeout(10),
+      user_agent("Mozilla/5.0 (compatible; BibliographyChecker/1.0)")
+    )
+
+    if (status_code(response) == 200) {
+      data <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
+      return(data$message)
+    }
+  }, error = function(e) {
+    # Best effort: the caller reports a NULL result as "could not fetch"
+  })
+
+  return(NULL)
+}
+
+#' Test whether a bibliography field or metadata element holds any value
+#'
+#' @param x Field value (may be NULL, NA, a vector or a list)
+#' @return TRUE if x contains at least one non-NA element
+field_has_value <- function(x) {
+  values <- unlist(x)
+  length(values) > 0 && !all(is.na(values))
+}
+
+#' Normalize string for comparison
+#'
+#' Accepts NULL, NA, a character vector or a list of character vectors;
+#' multiple elements are joined with single spaces before normalizing.
+#'
+#' @param s String to normalize
+#' @return Normalized string ("" when there is nothing to normalize)
+normalize_string <- function(s) {
+  # Flatten first so NULL, zero-length and multi-element input cannot reach a
+  # scalar `if` condition.
+  parts <- as.character(unlist(s))
+  parts <- parts[!is.na(parts)]
+  if (length(parts) == 0) {
+    return("")
+  }
+  s <- paste(parts, collapse = " ")
+
+  # Convert to lowercase, remove punctuation, remove extra whitespace
+  s <- tolower(s)
+  s <- str_replace_all(s, "[^a-zA-Z0-9\\s]", "")
+  s <- str_replace_all(s, "\\s+", " ")
+  return(trimws(s))
+}
+
+#' Compare BibTeX entry with DOI metadata
+#'
+#' Title, author and year are compared independently; a comparison is skipped
+#' when either side lacks the corresponding value.
+#'
+#' @param entry Bibliography entry
+#' @param metadata CrossRef metadata
+#' @return List with match (TRUE when no mismatch was found) and warnings
+#'   (character vector, one message per mismatch)
+compare_metadata <- function(entry, metadata) {
+  warnings <- character(0)
+
+  # Check title
+  if (field_has_value(entry$TITLE) && !is.null(metadata$title)) {
+    bib_title <- normalize_string(entry$TITLE)
+
+    # metadata$title may be a list or a vector; compare its first element
+    crossref_titles <- as.character(unlist(metadata$title))
+    crossref_title_raw <- if (length(crossref_titles) > 0) {
+      crossref_titles[[1]]
+    } else {
+      ""
+    }
+    crossref_title <- normalize_string(crossref_title_raw)
+
+    # Check word overlap (at least 50% of the shorter title's distinct words)
+    if (bib_title != "" && crossref_title != "") {
+      bib_words <- unique(str_split(bib_title, "\\s+")[[1]])
+      crossref_words <- unique(str_split(crossref_title, "\\s+")[[1]])
+
+      if (length(bib_words) > 0 && length(crossref_words) > 0) {
+        overlap <- length(intersect(bib_words, crossref_words))
+        total <- min(length(bib_words), length(crossref_words))
+
+        if (total > 0 && overlap / total < 0.5) {
+          warnings <- c(warnings, sprintf(
+            "Title mismatch: BibTeX='%s' vs DOI='%s'",
+            paste(unlist(entry$TITLE), collapse = " "), crossref_title_raw
+          ))
+        }
+      }
+    }
+  }
+
+  # Check author (basic check)
+  if (field_has_value(entry$AUTHOR) && !is.null(metadata$author)) {
+    bib_author <- normalize_string(entry$AUTHOR)
+
+    crossref_authors <- metadata$author
+    if (is.data.frame(crossref_authors) && nrow(crossref_authors) > 0) {
+      family_names <- crossref_authors$family[!is.na(crossref_authors$family)]
+
+      if (length(family_names) > 0) {
+        # One CrossRef family name occurring in the BibTeX authors is enough
+        found_match <- FALSE
+        for (name in family_names) {
+          norm_name <- normalize_string(name)
+          if (norm_name != "" && grepl(norm_name, bib_author, fixed = TRUE)) {
+            found_match <- TRUE
+            break
+          }
+        }
+
+        if (!found_match) {
+          warnings <- c(warnings, sprintf(
+            "Author mismatch: BibTeX='%s' vs DOI authors",
+            paste(unlist(entry$AUTHOR), collapse = " and ")
+          ))
+        }
+      }
+    }
+  }
+
+  # Check year
+  if (field_has_value(entry$YEAR)) {
+    bib_year <- as.character(entry$YEAR)
+
+    # Try published-print first, then published-online
+    crossref_year <- NULL
+    if (!is.null(metadata$`published-print`$`date-parts`)) {
+      date_parts <- metadata$`published-print`$`date-parts`
+      if (length(date_parts) > 0 && length(date_parts[[1]]) > 0) {
+        crossref_year <- as.character(date_parts[[1]][1])
+      }
+    }
+
+    if (is.null(crossref_year) && !is.null(metadata$`published-online`$`date-parts`)) {
+      date_parts <- metadata$`published-online`$`date-parts`
+      if (length(date_parts) > 0 && length(date_parts[[1]]) > 0) {
+        crossref_year <- as.character(date_parts[[1]][1])
+      }
+    }
+
+    if (!is.null(crossref_year) && !is.na(crossref_year) && bib_year != crossref_year) {
+      warnings <- c(warnings, sprintf(
+        "Year mismatch: BibTeX='%s' vs DOI='%s'",
+        bib_year, crossref_year
+      ))
+    }
+  }
+
+  # match reflects the outcome: TRUE only when no mismatch was recorded
+  return(list(match = length(warnings) == 0, warnings = warnings))
+}
+
+#' Check bibliography file for DOI requirements
+#'
+#' @param filepath Path to bibliography file
+#' @param verify_metadata Whether to verify metadata (default TRUE)
+#' @return List with checked_count, errors_count, and errors (messages)
+check_bibliography_file <- function(filepath, verify_metadata = TRUE) {
+  cat(sprintf("\nChecking %s...\n", filepath))
+
+  bib_df <- parse_bibtex_file(filepath)
+  if (is.null(bib_df)) {
+    return(list(checked_count = 0, errors_count = 1, errors = c("Failed to parse BibTeX file")))
+  }
+
+  errors <- character(0)
+  checked_count <- 0
+
+  for (i in seq_len(nrow(bib_df))) {
+    entry <- bib_df[i, ]
+    entry_type <- tolower(entry$CATEGORY)
+
+    # Only check books and articles
+    if (!(entry_type %in% c("book", "article"))) {
+      next
+    }
+
+    checked_count <- checked_count + 1
+    cat(sprintf(" Checking %s '%s'...\n", entry_type, entry$BIBTEXKEY))
+
+    # Check 1: DOI field exists
+    doi_check <- check_doi_field(entry)
+    if (!doi_check$has_doi) {
+      errors <- c(errors, doi_check$error)
+      cat(sprintf(" ❌ %s\n", doi_check$error))
+      next
+    }
+
+    doi <- entry$DOI
+    cat(sprintf(" DOI: %s\n", doi))
+
+    # Check 2: DOI URL is valid
+    url_check <- validate_doi_url(doi)
+    if (!url_check$is_valid) {
+      error_msg <- sprintf("Entry '%s': %s", entry$BIBTEXKEY, url_check$error)
+      errors <- c(errors, error_msg)
+      cat(sprintf(" ❌ %s\n", error_msg))
+      next
+    } else {
+      cat(sprintf(" ✓ DOI URL is valid (status %d)\n", url_check$status_code))
+    }
+
+    # Check 3: Metadata matches (if enabled); mismatches are warnings only
+    if (verify_metadata) {
+      cat(" Fetching DOI metadata...\n")
+      metadata <- get_doi_metadata(doi)
+
+      if (!is.null(metadata)) {
+        comparison <- compare_metadata(entry, metadata)
+        if (length(comparison$warnings) > 0) {
+          for (warning_msg in comparison$warnings) {
+            cat(sprintf(" ⚠️ %s\n", warning_msg))
+          }
+        } else {
+          cat(" ✓ Metadata appears consistent\n")
+        }
+      } else {
+        cat(" ⚠️ Could not fetch metadata from CrossRef API\n")
+      }
+
+      # Small delay to be nice to the API
+      Sys.sleep(0.5)
+    }
+  }
+
+  return(list(
+    checked_count = checked_count,
+    errors_count = length(errors),
+    errors = errors
+  ))
+}
+
+# Main execution: parse command-line arguments, check each file, print a
+# summary and exit with status 1 if any error was found (0 otherwise)
+run_doi_validation <- function() {
+  args <- commandArgs(trailingOnly = TRUE)
+
+  # Parse arguments
+  no_metadata_check <- "--no-metadata-check" %in% args
+  files <- args[!grepl("^--", args)]
+
+  if (length(files) == 0) {
+    cat("Usage: check-bibliography-dois.R [--no-metadata-check] <file1.bib> [file2.bib ...]\n")
+    quit(status = 1)
+  }
+
+  total_checked <- 0
+  total_errors <- 0
+  all_errors <- character(0)
+
+  for (filepath in files) {
+    if (!file.exists(filepath)) {
+      cat(sprintf("Error: File %s does not exist\n", filepath))
+      quit(status = 1)
+    }
+
+    result <- check_bibliography_file(filepath, verify_metadata = !no_metadata_check)
+    total_checked <- total_checked + result$checked_count
+    total_errors <- total_errors + result$errors_count
+    all_errors <- c(all_errors, result$errors)
+  }
+
+  # Print summary
+  cat("\n")
+  cat(paste(rep("=", 70), collapse = ""), "\n")
+  cat("SUMMARY\n")
+  cat(paste(rep("=", 70), collapse = ""), "\n")
+  cat(sprintf("Total entries checked: %d\n", total_checked))
+  cat(sprintf("Errors found: %d\n", total_errors))
+
+  if (total_errors > 0) {
+    cat("\nERRORS:\n")
+    for (error_msg in all_errors) {
+      cat(sprintf(" • %s\n", error_msg))
+    }
+    quit(status = 1)
+  } else {
+    cat("\n✓ All checks passed!\n")
+    quit(status = 0)
+  }
+}
+
+# Run main function
+run_doi_validation()
diff --git a/.github/scripts/check-non-standard-chars.py
b/.github/scripts/check-non-standard-chars.py
new file mode 100755
index 00000000..b6ae5898
--- /dev/null
+++ b/.github/scripts/check-non-standard-chars.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""
+Script to detect non-standard characters in .qmd and .R files.
+
+This script checks for curly quotes and other non-standard characters that
+can cause issues in Quarto/R projects, such as:
+- " (U+201C) - Left double quotation mark
+- " (U+201D) - Right double quotation mark
+- ' (U+2018) - Left single quotation mark
+- ' (U+2019) - Right single quotation mark
+- – (U+2013) - En dash
+- — (U+2014) - Em dash
+
+These should typically be replaced with their ASCII equivalents:
+- " (U+0022) - Quotation mark
+- ' (U+0027) - Apostrophe
+- - (U+002D) - Hyphen-minus
+"""
+
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+# Non-standard characters to detect
+NON_STANDARD_CHARS = {
+    '\u201C': 'Left double quotation mark',
+    '\u201D': 'Right double quotation mark',
+    '\u2018': 'Left single quotation mark',
+    '\u2019': 'Right single quotation mark',
+    '\u2013': 'En dash',
+    '\u2014': 'Em dash',
+}
+
+
+def check_file(
+    file_path: Path,
+    read_errors: Optional[List[str]] = None,
+) -> List[Tuple[int, int, str, str]]:
+    """
+    Check a file for non-standard characters.
+
+    Args:
+        file_path: Path to the file to check
+        read_errors: Optional list; when given, a message is appended to it
+            if the file cannot be read or decoded as UTF-8
+
+    Returns:
+        List of tuples (line_number, column, character, description); empty
+        when the file is clean or could not be read
+    """
+    issues = []
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, start=1):
+                for col, char in enumerate(line, start=1):
+                    if char in NON_STANDARD_CHARS:
+                        issues.append((
+                            line_num,
+                            col,
+                            char,
+                            NON_STANDARD_CHARS[char]
+                        ))
+    except UnicodeDecodeError as e:
+        message = f"Error: {file_path} has encoding issues: {e}"
+    except Exception as e:
+        message = f"Error reading {file_path}: {e}"
+    else:
+        return issues
+
+    # Reaching this point means the file could not be fully read
+    print(message, file=sys.stderr)
+    if read_errors is not None:
+        read_errors.append(message)
+    return []
+
+
+def find_files(root_dir: Path, extensions: List[str]) -> List[Path]:
+    """
+    Find all files with given extensions in the directory tree.
+
+    Directories and anything inside a `.git` directory are skipped.
+
+    Args:
+        root_dir: Root directory to search
+        extensions: List of file extensions to search for (e.g., ['.qmd', '.R'])
+
+    Returns:
+        Sorted list of matching file paths
+    """
+    files = []
+    for ext in extensions:
+        for path in root_dir.glob(f'**/*{ext}'):
+            if path.is_file() and '.git' not in path.parts:
+                files.append(path)
+    return sorted(files)
+
+
+def main() -> int:
+    """
+    Main function to check all .qmd and .R files for non-standard characters.
+
+    Returns:
+        0 if every file was read and no issues were found, 1 otherwise
+    """
+    root_dir = Path('.')
+    extensions = ['.qmd', '.R']
+
+    print("Checking for non-standard characters in .qmd and .R files...\n")
+
+    files = find_files(root_dir, extensions)
+
+    if not files:
+        print("No .qmd or .R files found.")
+        return 0
+
+    total_issues = 0
+    files_with_issues: Dict[Path, List[Tuple[int, int, str, str]]] = {}
+    read_errors: List[str] = []
+
+    for file_path in files:
+        issues = check_file(file_path, read_errors)
+        if issues:
+            files_with_issues[file_path] = issues
+            total_issues += len(issues)
+
+    if read_errors:
+        # An unreadable file has not been checked, so it must not pass
+        print(f"✗ Could not check {len(read_errors)} file(s); see the error messages.")
+
+    if not files_with_issues:
+        if read_errors:
+            return 1
+        print(f"✓ No non-standard characters found in {len(files)} file(s).")
+        return 0
+
+    # Print detailed report
+    print(f"✗ Found {total_issues} non-standard character(s) in {len(files_with_issues)} file(s):\n")
+
+    for file_path, issues in files_with_issues.items():
+        print(f"{file_path}:")
+        for line_num, col, char, description in issues:
+            print(f" Line {line_num}, Column {col}: {description}")
+            # Show the character in a visible way
+            print(f" Found: '{char}' (U+{ord(char):04X})")
+        print()
+
+    print("Please replace these characters with their ASCII equivalents:")
+    print(' \u201C or \u201D -> " (standard double quote)')
+    print(' \u2018 or \u2019 -> \' (standard single quote)')
+    print(' \u2013 or \u2014 -> - (standard hyphen)')
+    print()
+
+    return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/.github/workflows/check-bibliography-dois.yml b/.github/workflows/check-bibliography-dois.yml
new file mode 100644
index 00000000..156a5815
--- /dev/null
+++ b/.github/workflows/check-bibliography-dois.yml
@@ -0,0 +1,52 @@
+name: Check Bibliography DOIs
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  check-dois:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses:
r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: |
+            any::pkgdown
+            any::rmarkdown
+            any::bib2df
+            any::httr
+            any::jsonlite
+            any::stringr
+
+      - name: Find bibliography files
+        id: find-bibs
+        run: |
+          # Find all .bib files in the repository
+          BIB_FILES=$(find . -name "*.bib" -not -path "./.git/*" | tr '\n' ' ')
+          echo "Found bibliography files: $BIB_FILES"
+          echo "bib_files=$BIB_FILES" >> "$GITHUB_OUTPUT"
+
+      - name: Check bibliography DOIs
+        if: steps.find-bibs.outputs.bib_files != ''
+        run: |
+          Rscript .github/scripts/check-bibliography-dois.R $BIB_FILES  # unquoted on purpose: the space-separated list must split into one argument per file
+        env:
+          BIB_FILES: ${{ steps.find-bibs.outputs.bib_files }}
+
+      - name: Summary
+        if: success() && steps.find-bibs.outputs.bib_files != ''
+        run: |
+          echo "✓ All bibliography entries have valid DOIs"
diff --git a/.github/workflows/check-links.yml b/.github/workflows/check-links.yml
index 8a6f2648..7e224dda 100644
--- a/.github/workflows/check-links.yml
+++ b/.github/workflows/check-links.yml
@@ -15,11 +15,13 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       contents: read
+      issues: write
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
 
       - name: Check links in markdown and HTML files
+        id: lychee
         uses: lycheeverse/lychee-action@v2
         with:
           # Check all .qmd (Quarto markdown), .md (markdown), and .html files
@@ -28,3 +30,26 @@ jobs:
           fail: true
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Create issue on main branch if links are broken
+        if: failure() && github.ref == 'refs/heads/main'
+        run: |
+          # Create issue with broken link details
+          BODY=$(cat <<'EOF'
+          The link checker found broken links in the main branch.
+
+          **Workflow Run:** ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+          Please review the workflow logs above for details on which links are broken.
+
+          This issue was automatically created by the link checker workflow.
+
+          @copilot please fix the broken links found in the main branch.
+          EOF
+          )
+          gh issue create \
+            --title "Broken links detected in main branch" \
+            --body "$BODY" \
+            --label "bug,automated,copilot"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/check-non-standard-chars.yaml b/.github/workflows/check-non-standard-chars.yaml
new file mode 100644
index 00000000..ebf6a8c5
--- /dev/null
+++ b/.github/workflows/check-non-standard-chars.yaml
@@ -0,0 +1,27 @@
+name: Check Non-Standard Characters
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  check-chars:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Check for non-standard characters
+        run: python3 .github/scripts/check-non-standard-chars.py