diff --git a/Cargo.lock b/Cargo.lock index 30208e3..1a3407a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -144,6 +144,7 @@ dependencies = [ "anyhow", "paxhtml", "paxhtml_tailwind", + "serde", "serde_json", "syntect", "wikitext_simplified", diff --git a/Cargo.toml b/Cargo.toml index b928ee7..6d1e16a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ edition = "2024" [dependencies] anyhow = "1.0.98" +serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.140" wikitext_simplified = { git = "https://github.com/philpax/wikitext_simplified.git" } # wikitext_simplified = { path = "../wikitext_simplified/wikitext_simplified" } diff --git a/src/main.rs b/src/main.rs index c1e56c2..443c43a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,8 +5,9 @@ use std::{ sync::OnceLock, }; +use serde::{Deserialize, Serialize}; use template::{TemplateToInstantiate, Templates}; -use wikitext_simplified::{WikitextSimplifiedNode, Spanned, wikitext_util::parse_wiki_text_2}; +use wikitext_simplified::{Spanned, WikitextSimplifiedNode, wikitext_util::parse_wiki_text_2}; mod page_context; use page_context::PageContext; @@ -25,6 +26,16 @@ static SYNTAX_HIGHLIGHTER: OnceLock = OnceLock::new() struct GeneratedPages { // Maps directory path (relative to wiki root) to set of page names (without .html) pages_by_directory: BTreeMap>, + // Search index + search_index: SearchIndex, +} + +#[derive(Debug, Default, Serialize, Deserialize)] +struct SearchIndex { + /// List of page titles (index = page ID) + pages: Vec, + /// Inverted index: word -> list of (page_index, weight) tuples + words: BTreeMap>, } fn main() -> anyhow::Result<()> { @@ -229,6 +240,10 @@ fn generate_wiki(src: &Path, dst: &Path) -> anyhow::Result<()> { // Generate missing index pages generate_missing_index_pages(output_dir, &generated)?; + // Write search index + let search_index_json = serde_json::to_string(&generated.search_index)?; + fs::write(output_dir.join("search-index.json"), search_index_json)?; + redirect(&page_title_to_route_path("Main_Page").url_path()) .write_to_route(dst, paxhtml::RoutePath::new([], "index.html".to_string()))?; @@ -324,10 +339,72 @@ fn generate_wiki_folder( sub_page_name, }; + // Extract search data from the page + let mut all_text = String::new(); + let mut all_headings = Vec::new(); + for node in &simplified { + let (text, headings) = extract_search_data(&node.value); + all_text.push_str(&text); + all_text.push(' '); + all_headings.extend(headings); + } + + // Write search text to file + let search_text_path = output_html.with_extension("txt"); + fs::write(&search_text_path, all_text.trim())?; + + // Add page title to search index + let page_idx = generated.search_index.pages.len(); + generated + .search_index + .pages + .push(page_context.title.clone()); + + // Build word weight map for this page + let mut word_weights: BTreeMap = BTreeMap::new(); + + // Index words from content (weight 1) + for word in tokenize_text(&all_text) { + word_weights.entry(word).or_insert(1); + } + + // Index words from headings (weight 3, higher priority) + for heading in &all_headings { + for word in tokenize_text(heading) { + word_weights + .entry(word) + .and_modify(|w| *w = (*w).max(3)) + .or_insert(3); + } + } + + // Index words from title (weight 5, highest priority) + for word in tokenize_text(&page_context.title) { + word_weights + .entry(word) + .and_modify(|w| *w = (*w).max(5)) + .or_insert(5); + } + + // Add to inverted index with weights + for (word, weight) in word_weights { + generated + .search_index + .words + .entry(word) + .or_default() + .push((page_idx, weight)); + } + layout( &page_context.title, paxhtml::Element::from_iter(simplified.iter().map(|node| { - convert_wikitext_to_html(templates, pwt_configuration, &node.value, &page_context) + convert_wikitext_to_html( + templates, + pwt_configuration, + &node.value, + &page_context, + ) })), ) }; @@ -398,7 +475,19 @@ fn layout(title: &str, inner: paxhtml::Element) -> paxhtml::Document { -
+
+
+ + +
"Website"
@@ -412,6 +501,7 @@ fn layout(title: &str, inner: paxhtml::Element) -> paxhtml::Document { + }, @@ -441,7 +531,10 @@ fn convert_wikitext_to_html( let attributes = templates.instantiate( pwt_configuration, TemplateToInstantiate::Node(WikitextSimplifiedNode::Fragment { - children: attributes.iter().map(|n| empty_spanned(n.clone())).collect(), + children: attributes + .iter() + .map(|n| empty_spanned(n.clone())) + .collect(), }), &[], page_context, @@ -499,16 +592,22 @@ fn convert_wikitext_to_html( .unwrap_or_default() } - let convert_children = |templates: &mut Templates, children: &[Spanned]| { - paxhtml::Element::from_iter( - children - .iter() - .skip_while(|node| matches!(node.value, WSN::ParagraphBreak | WSN::Newline)) - .map(|node| { - convert_wikitext_to_html(templates, pwt_configuration, &node.value, page_context) - }), - ) - }; + let convert_children = + |templates: &mut Templates, children: &[Spanned]| { + paxhtml::Element::from_iter( + children + .iter() + .skip_while(|node| matches!(node.value, WSN::ParagraphBreak | WSN::Newline)) + .map(|node| { + convert_wikitext_to_html( + templates, + pwt_configuration, + &node.value, + page_context, + ) + }), + ) + }; match node { WSN::Fragment { children } => convert_children(templates, children), @@ -649,7 +748,10 @@ fn convert_wikitext_to_html( let instantiated = templates.instantiate( pwt_configuration, TemplateToInstantiate::Node(WikitextSimplifiedNode::Fragment { - children: attributes.iter().map(|n| empty_spanned(n.value.clone())).collect(), + children: attributes + .iter() + .map(|n| empty_spanned(n.value.clone())) + .collect(), }), &[], page_context, @@ -677,7 +779,10 @@ fn convert_wikitext_to_html( })); } - let unwrapped_attributes: Vec = modified_attributes.iter().map(|s| s.value.clone()).collect(); + let unwrapped_attributes: Vec = modified_attributes + .iter() + .map(|s| s.value.clone()) + .collect(); let attributes = parse_attributes_from_wsn( templates, pwt_configuration, @@ -834,3 +939,136 @@ fn redirect(to_url: &str) -> paxhtml::Document { }, ]) } + +/// Tokenize text into searchable words +fn tokenize_text(text: &str) -> Vec { + text.to_lowercase() + .split(|c: char| !c.is_alphanumeric()) + .filter(|word| word.len() >= 2) // Skip single characters + .map(|word| word.to_string()) + .collect() +} + +/// Extract plain text and headings from wikitext AST for search indexing +fn extract_search_data(node: &WikitextSimplifiedNode) -> (String, Vec) { + use WikitextSimplifiedNode as WSN; + + let mut text = String::new(); + let mut headings = Vec::new(); + + fn extract_recursive(node: &WSN, text: &mut String, headings: &mut Vec) { + match node { + WSN::Fragment { children } + | WSN::Bold { children } + | WSN::Italic { children } + | WSN::Blockquote { children } + | WSN::Superscript { children } + | WSN::Subscript { children } + | WSN::Small { children } + | WSN::Preformatted { children } => { + for child in children { + extract_recursive(&child.value, text, headings); + } + } + WSN::Heading { level: _, children } => { + let mut heading_text = String::new(); + for child in children { + extract_text_only(&child.value, &mut heading_text); + } + let heading_trimmed = heading_text.trim().to_string(); + if !heading_trimmed.is_empty() { + headings.push(heading_trimmed.clone()); + text.push_str(&heading_trimmed); + text.push(' '); + } + } + WSN::Link { + text: link_text, + title: _, + } => { + text.push_str(link_text); + text.push(' '); + } + WSN::ExtLink { + link: _, + text: link_text, + } => { + if let Some(t) = link_text { + text.push_str(t); + text.push(' '); + } + } + WSN::Text { text: t } => { + text.push_str(t); + text.push(' '); + } + WSN::Tag { children, .. } => { + for child in children { + extract_recursive(&child.value, text, headings); + } + } + WSN::Table { captions, rows, .. } => { + // Extract text from table captions + for caption in captions { + for node in &caption.content { + extract_recursive(&node.value, text, headings); + } + } + // Extract text from table cells + for row in rows { + for cell in &row.cells { + for node in &cell.content { + extract_recursive(&node.value, text, headings); + } + } + } + } + WSN::OrderedList { items } | WSN::UnorderedList { items } => { + for item in items { + for node in &item.content { + extract_recursive(&node.value, text, headings); + } + } + } + WSN::DefinitionList { items } => { + for item in items { + for node in &item.content { + extract_recursive(&node.value, text, headings); + } + } + } + WSN::Template { .. } + | WSN::TemplateParameterUse { .. } + | WSN::Redirect { .. } + | WSN::HorizontalDivider + | WSN::ParagraphBreak + | WSN::Newline => { + // Skip templates, parameters, and formatting elements + } + } + } + + fn extract_text_only(node: &WSN, text: &mut String) { + match node { + WSN::Text { text: t } => { + text.push_str(t); + } + WSN::Fragment { children } + | WSN::Bold { children } + | WSN::Italic { children } + | WSN::Heading { children, .. } => { + for child in children { + extract_text_only(&child.value, text); + } + } + _ => {} + } + } + + extract_recursive(node, &mut text, &mut headings); + + // Normalize whitespace + let normalized = text.split_whitespace().collect::>().join(" "); + + (normalized, headings) +} diff --git a/src/template.rs b/src/template.rs index 58100aa..e63eaa8 100644 --- a/src/template.rs +++ b/src/template.rs @@ -401,11 +401,13 @@ mod tests { // Verify the result is a table (possibly wrapped in a Fragment) let table_node = match &result { WikitextSimplifiedNode::Table { .. } => &result, - WikitextSimplifiedNode::Fragment { children } => &children - .iter() - .find(|node| matches!(node.value, WikitextSimplifiedNode::Table { .. })) - .expect("Fragment should contain a Table node") - .value, + WikitextSimplifiedNode::Fragment { children } => { + &children + .iter() + .find(|node| matches!(node.value, WikitextSimplifiedNode::Table { .. })) + .expect("Fragment should contain a Table node") + .value + } _ => panic!( "Expected Table or Fragment with Table node, got {:?}", result diff --git a/static/js/search.js b/static/js/search.js new file mode 100644 index 0000000..c2a5ffd --- /dev/null +++ b/static/js/search.js @@ -0,0 +1,305 @@ +/** + * JC2-MP Wiki Search Implementation + * Ultra-compact inverted index with on-demand text loading + */ + +class WikiSearch { + constructor() { + this.searchIndex = null; + this.isLoading = false; + this.isLoaded = false; + this.textCache = new Map(); // Cache loaded text files + } + + /** + * Load the search index from JSON file + */ + async loadIndex() { + if (this.isLoaded || this.isLoading) { + return; + } + + this.isLoading = true; + try { + const response = await fetch('/search-index.json'); + if (!response.ok) { + throw new Error('Failed to load search index'); + } + this.searchIndex = await response.json(); + this.isLoaded = true; + } catch (error) { + console.error('Error loading search index:', error); + this.searchIndex = { pages: [], words: {} }; + } finally { + this.isLoading = false; + } + } + + /** + * Derive URL from page title + */ + titleToUrl(title) { + return '/wiki/' + title.replace(/ /g, '_') + '.html'; + } + + /** + * Load text content for a specific page (with caching) + */ + async loadPageText(title) { + const url = this.titleToUrl(title); + if (this.textCache.has(url)) { + return this.textCache.get(url); + } + + try { + const textUrl = url.replace('.html', '.txt'); + const response = await fetch(textUrl); + if (!response.ok) { + return ''; + } + const text = await response.text(); + this.textCache.set(url, text); + return text; + } catch (error) { + console.error(`Error loading text for ${title}:`, error); + return ''; + } + } + + /** + * Tokenize text into searchable words (matches Rust implementation) + */ + tokenize(text) { + return text + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter(word => word.length >= 2); + } + + /** + * Extract snippet from content showing where the query appears + */ + extractSnippet(content, query, maxLength = 150) { + const normalizedContent = content.toLowerCase(); + const queryWords = this.tokenize(query); + + // Find the first occurrence of any query word + let bestIndex = -1; + for (const word of queryWords) { + const index = normalizedContent.indexOf(word); + if (index !== -1 && (bestIndex === -1 || index < bestIndex)) { + bestIndex = index; + } + } + + if (bestIndex === -1) { + // No query words found, return start of content + return content.substring(0, maxLength) + (content.length > maxLength ? '...' : ''); + } + + // Calculate snippet bounds to center the query + const snippetStart = Math.max(0, bestIndex - Math.floor(maxLength / 2)); + const snippetEnd = Math.min(content.length, snippetStart + maxLength); + + let snippet = content.substring(snippetStart, snippetEnd); + + // Add ellipsis if needed + if (snippetStart > 0) { + snippet = '...' + snippet; + } + if (snippetEnd < content.length) { + snippet = snippet + '...'; + } + + return snippet; + } + + /** + * Search the index and return ranked results + */ + search(query, limit = 20) { + if (!query || !this.isLoaded || !this.searchIndex) { + return []; + } + + const queryWords = this.tokenize(query); + if (queryWords.length === 0) { + return []; + } + + // Map to track page index -> total score + const pageScores = new Map(); + + // For each query word, look up pages in the inverted index + for (const word of queryWords) { + const pageWeights = this.searchIndex.words[word]; + if (!pageWeights) { + continue; // Word not found in index + } + + // Add weighted scores for each page + for (const [pageIdx, weight] of pageWeights) { + pageScores.set(pageIdx, (pageScores.get(pageIdx) || 0) + weight); + } + } + + // Convert to array with page titles + const results = []; + for (const [pageIdx, score] of pageScores.entries()) { + const title = this.searchIndex.pages[pageIdx]; + if (title) { + results.push({ + title: title, + url: this.titleToUrl(title), + score: score, + }); + } + } + + // Sort by score (descending) and limit results + results.sort((a, b) => b.score - a.score); + return results.slice(0, limit); + } + + /** + * Highlight query terms in text + */ + highlightTerms(text, query) { + if (!query || !text) { + return text; + } + + const queryWords = this.tokenize(query); + let highlightedText = text; + + // Sort query words by length (longest first) to avoid partial replacements + queryWords.sort((a, b) => b.length - a.length); + + for (const word of queryWords) { + if (!word) continue; + + // Create regex to match word case-insensitively (word boundaries) + const regex = new RegExp(`\\b(${this.escapeRegex(word)})\\b`, 'gi'); + highlightedText = highlightedText.replace(regex, '$1'); + } + + return highlightedText; + } + + /** + * Escape special regex characters + */ + escapeRegex(str) { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + } +} + +// Global search instance +const wikiSearch = new WikiSearch(); + +/** + * Initialize search UI + */ +document.addEventListener('DOMContentLoaded', () => { + const searchInput = document.getElementById('wiki-search-input'); + const searchResults = document.getElementById('wiki-search-results'); + + if (!searchInput || !searchResults) { + return; + } + + // Load search index on first interaction + let indexLoadStarted = false; + searchInput.addEventListener('focus', async () => { + if (!indexLoadStarted) { + indexLoadStarted = true; + await wikiSearch.loadIndex(); + } + }); + + // Debounce search to avoid excessive searches while typing + let searchTimeout; + searchInput.addEventListener('input', (e) => { + clearTimeout(searchTimeout); + + const query = e.target.value.trim(); + + if (!query) { + searchResults.innerHTML = ''; + searchResults.classList.add('hidden'); + return; + } + + searchTimeout = setTimeout(async () => { + // Ensure index is loaded + if (!wikiSearch.isLoaded) { + await wikiSearch.loadIndex(); + } + + const results = wikiSearch.search(query); + displaySearchResults(results, query); + }, 300); // 300ms debounce + }); + + // Hide results when clicking outside + document.addEventListener('click', (e) => { + if (!searchInput.contains(e.target) && !searchResults.contains(e.target)) { + searchResults.classList.add('hidden'); + } + }); + + // Show results when input is focused and has content + searchInput.addEventListener('focus', () => { + if (searchInput.value.trim() && searchResults.children.length > 0) { + searchResults.classList.remove('hidden'); + } + }); + + /** + * Display search results in the UI + */ + async function displaySearchResults(results, query) { + if (results.length === 0) { + searchResults.innerHTML = ` +
+ No results found for "${query}" +
+ `; + searchResults.classList.remove('hidden'); + return; + } + + // Load text content for all results to show snippets + // Average text file is ~300 bytes, so 20 results = ~6KB total + const snippetPromises = results.map(async (result) => { + const text = await wikiSearch.loadPageText(result.title); + return { + ...result, + snippet: text ? wikiSearch.extractSnippet(text, query) : null + }; + }); + + const resultsWithSnippets = await Promise.all(snippetPromises); + + const resultsHTML = resultsWithSnippets.map(result => { + const highlightedTitle = wikiSearch.highlightTerms(result.title, query); + + // Show snippet if available + let detailsHTML = ''; + if (result.snippet) { + const highlightedSnippet = wikiSearch.highlightTerms(result.snippet, query); + detailsHTML = `
${highlightedSnippet}
`; + } + + return ` + +
${highlightedTitle}
+ ${detailsHTML} +
+ `; + }).join(''); + + searchResults.innerHTML = resultsHTML; + searchResults.classList.remove('hidden'); + } +});