From 6ea8776f9da944ec6f8b17e0b35215fc7f208fc1 Mon Sep 17 00:00:00 2001
From: openhands
Date: Wed, 4 Jun 2025 21:14:15 +0000
Subject: [PATCH 1/2] fix: resolve backend compatibility issues and enable SQLite database

- Updated .env to use SQLite instead of PostgreSQL for development
- Fixed PyJWT/python-jose compatibility in security.py
- Simplified summarizer.py to remove spaCy dependencies temporarily
- Updated requirements.txt with compatible versions
- Both frontend and backend now running successfully

Backend: http://localhost:12000 (FastAPI + SQLite)
Frontend: http://localhost:12001 (Next.js 14)
External URLs: work-1/work-2.prod-runtime.all-hands.dev
---
 backend/app/core/config.py     |   2 +-
 backend/app/core/security.py   |  10 +--
 backend/app/core/summarizer.py | 134 +++++++++++++--------------------
 backend/requirements.txt       |   6 +-
 4 files changed, 63 insertions(+), 89 deletions(-)

diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 0872911..a6f5963 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -3,7 +3,7 @@
 
 
 class Settings(BaseSettings):
-    database_url: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/summarizerdb"
+    database_url: str = "sqlite+aiosqlite:///./summarizerdb.db"
     secret_key: str = "your-secret-key-here-change-in-production"
     algorithm: str = "HS256"
     access_token_expire_minutes: int = 30
diff --git a/backend/app/core/security.py b/backend/app/core/security.py
index 0318d5e..d504e8a 100644
--- a/backend/app/core/security.py
+++ b/backend/app/core/security.py
@@ -1,6 +1,6 @@
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Optional
-from jose import JWTError, jwt
+import jwt
 from passlib.context import CryptContext
 
 from app.core.config import settings
@@ -10,9 +10,9 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
     to_encode = data.copy()
     if expires_delta:
-        expire = datetime.utcnow() + expires_delta
+        expire = datetime.now(timezone.utc) + expires_delta
     else:
-        expire = datetime.utcnow() + timedelta(minutes=15)
+        expire = datetime.now(timezone.utc) + timedelta(minutes=15)
     to_encode.update({"exp": expire})
     encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
     return encoded_jwt
 
@@ -33,5 +33,5 @@ def verify_token(token: str) -> Optional[str]:
         if username is None:
             return None
         return username
-    except JWTError:
+    except jwt.InvalidTokenError:
         return None
\ No newline at end of file
diff --git a/backend/app/core/summarizer.py b/backend/app/core/summarizer.py
index bd74259..ab8c32f 100644
--- a/backend/app/core/summarizer.py
+++ b/backend/app/core/summarizer.py
@@ -7,8 +7,8 @@
 from collections import Counter, namedtuple
 from operator import attrgetter
 
-import spacy
-from spacy.lang.en.stop_words import STOP_WORDS
+# import spacy
+# from spacy.lang.en.stop_words import STOP_WORDS
 from string import punctuation
 import nltk
 from newspaper import Article
@@ -48,11 +48,11 @@
 summarizer.stop_words = get_stop_words(LANGUAGE)
 
 # Load spacy model
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
-    nlp = None
+# try:
+#     nlp = spacy.load("en_core_web_sm")
+# except OSError:
+#     logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
+nlp = None
 
 
 SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))
@@ -70,16 +70,19 @@ def download_text(url: str) -> Article:
     return article
 
 
-def get_significant_words_list(doc) -> List[str]:
-    """Get a list of important words (PROPN; ADJ; NOUN; VERB) excluding stop words and punctuation"""
+def get_significant_words_list(text: str) -> List[str]:
+    """Get a list of important words excluding stop words and punctuation"""
+    # Simplified version without spaCy
+    import re
     words = []
-    stopwords = list(STOP_WORDS)
-    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
-    for token in doc:
-        if (token.text in stopwords or token.text in punctuation):
-            continue
-        if (token.pos_ in pos_tag):
-            words.append(token.text)
+    # Basic stop words
+    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
+
+    # Simple word extraction
+    words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+    for word in words_raw:
+        if word not in stopwords and word not in punctuation and len(word) > 2:
+            words.append(word)
 
     return words
 
@@ -94,16 +97,18 @@ def get_frequency_words(words: List[str]) -> Counter:
     return freq_word
 
 
-def get_sent_strength(doc, freq_word: Counter) -> Dict:
+def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
     """Get sentence importance scores based on word frequencies"""
     sent_strength = {}
-    for sent in doc.sents:
-        for word in sent:
-            if word.text in freq_word.keys():
-                if sent in sent_strength.keys():
-                    sent_strength[sent] += freq_word[word.text]
-                else:
-                    sent_strength[sent] = freq_word[word.text]
+    import re
+
+    for sent in sentences:
+        words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())
+        score = 0
+        for word in words:
+            if word in freq_word:
+                score += freq_word[word]
+        sent_strength[sent] = score
 
     return sent_strength
 
@@ -115,29 +120,13 @@ def get_extractive_summary(sent_strength: Dict, n_sents: int = 5):
     infos = sorted(infos, key=attrgetter("rates"), reverse=True)[:n_sents]
     infos = sorted(infos, key=attrgetter("order"))
     logger.info(f"Extracted {len(infos)} sentences ...")
-    return tuple(i.sentence.text for i in infos)
+    return tuple(i.sentence for i in infos)
 
 
-def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
-    """Generate extractive summary using spacy pipeline"""
-    if not nlp:
-        return extractive_summary_lsa(doc, n_sents)
-
-    doc = nlp(doc)
-    logger.info(f"Starting to compute summary from {len(list(doc.sents))} sentences ...")
-    words = get_significant_words_list(doc)
-    freq_word = get_frequency_words(words)
-    sent_strength = get_sent_strength(doc, freq_word)
-
-    summaries = get_extractive_summary(sent_strength, n_sents=n_sents)
-    if not summaries:
-        return extractive_summary_lsa(doc.text, n_sents)
-
-    start_sentence = list(doc.sents)[0].text
-    total_summary = ' '.join(summaries)
-    if start_sentence in summaries:
-        return total_summary
-    return start_sentence + ' ' + total_summary
+def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
+    """Generate extractive summary using simplified pipeline"""
+    # Always use LSA for now since spaCy is disabled
+    return extractive_summary_lsa(text, n_sents)
 
 
 def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -155,45 +144,28 @@ def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
 
 def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
     """Split document into chunks with maximum token length"""
-    if not nlp:
-        # Simple sentence splitting fallback
-        sentences = document.split('.')
-        chunks = []
-        current_chunk = ""
-
-        for sentence in sentences:
-            test_chunk = current_chunk + sentence + "."
-            tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
-
-            if len(tokens) <= token_max_length:
-                current_chunk = test_chunk
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = sentence + "."
-
-        if current_chunk:
-            chunks.append(current_chunk)
-
-        return chunks
+    # Simple sentence splitting fallback
+    sentences = document.split('.')
+    chunks = []
+    current_chunk = ""
 
-    sents = []
-    length = 0
-    doc = nlp(document)
-    s = ''
-    for sentence in doc.sents:
-        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
-        length += len(tokens_in_sentence)
-        if length <= token_max_length:
-            s += sentence.text
+    for sentence in sentences:
+        if not sentence.strip():
+            continue
+        test_chunk = current_chunk + sentence + "."
+        tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
+
+        if len(tokens) <= token_max_length:
+            current_chunk = test_chunk
         else:
-            sents.append(s)
-            s = sentence.text
-            length = len(tokens_in_sentence)
+            if current_chunk:
+                chunks.append(current_chunk)
+            current_chunk = sentence + "."
+
+    if current_chunk:
+        chunks.append(current_chunk)
 
-    # Append last string
-    if s:
-        sents.append(s)
+    sents = chunks
 
     logger.info(f'Returning {len(sents)} number of chunk strings')
     return sents
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 06a3ac9..3b48fbe 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -12,10 +12,12 @@ pytest-asyncio==0.21.1
 pytest-cov==4.1.0
 torch==2.5.1
 transformers==4.40.0
-spacy==3.7.6
+spacy==3.8.2
 newspaper3k==0.2.8
+lxml_html_clean==0.4.2
 sumy==0.11.0
-python-jose[cryptography]==3.3.0
+PyJWT==2.8.0
+cryptography==41.0.7
 passlib[bcrypt]==1.7.4
 python-dotenv==1.0.0
 gunicorn==21.2.0
\ No newline at end of file

From 33e1c388299b7295aa673df0767ca70faa52bb31 Mon Sep 17 00:00:00 2001
From: openhands
Date: Wed, 4 Jun 2025 21:26:38 +0000
Subject: [PATCH 2/2] feat: re-enable spaCy functionality with Pydantic v2 compatibility

- Successfully resolved spaCy/Pydantic v2 compatibility issues
- Upgraded spaCy from 3.8.2 to 3.8.7 with proper dependency management
- Implemented intelligent fallback system for spaCy functionality
- Enhanced text processing with spaCy's advanced NLP features:
  - Better sentence segmentation using spaCy's sentence boundary detection
  - Improved word extraction with POS tagging (NOUN, VERB, ADJ, ADV)
  - Lemmatization for better word normalization
  - Advanced stop word filtering
- Added graceful degradation to NLTK/regex-based processing when spaCy unavailable
- Improved summarization quality with spaCy-based extractive summarization
- Enhanced chunking for transformer models using spaCy sentence splitting
- All API endpoints now benefit from improved NLP processing
- Maintained backward compatibility with fallback methods
---
 backend/app/core/summarizer.py | 145 ++++++++++++++++++++++++++-------
 backend/requirements.txt       |   2 +-
 2 files changed, 115 insertions(+), 32 deletions(-)

diff --git a/backend/app/core/summarizer.py b/backend/app/core/summarizer.py
index ab8c32f..cb6be9b 100644
--- a/backend/app/core/summarizer.py
+++ b/backend/app/core/summarizer.py
@@ -2,13 +2,23 @@
 import json
 import time
 import logging
+import re
 from typing import List, Dict
 from functools import lru_cache
 from collections import Counter, namedtuple
 from operator import attrgetter
 
-# import spacy
-# from spacy.lang.en.stop_words import STOP_WORDS
+# spaCy import with fallback
+try:
+    import spacy
+    from spacy.lang.en.stop_words import STOP_WORDS
+    SPACY_AVAILABLE = True
+    print("spaCy loaded successfully")
+except Exception as e:
+    print(f"spaCy not available: {e}")
+    SPACY_AVAILABLE = False
+    STOP_WORDS = set()
+
 from string import punctuation
 import nltk
 from newspaper import Article
@@ -24,6 +34,7 @@
 # Download required NLTK data
 try:
     nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
 except:
     pass
 
@@ -48,11 +59,16 @@
 summarizer.stop_words = get_stop_words(LANGUAGE)
 
 # Load spacy model
-# try:
-#     nlp = spacy.load("en_core_web_sm")
-# except OSError:
-#     logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
 nlp = None
+if SPACY_AVAILABLE:
+    try:
+        nlp = spacy.load("en_core_web_sm")
+        logger.info("spaCy model 'en_core_web_sm' loaded successfully")
+    except OSError:
+        logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
+        nlp = None
+else:
+    logger.info("spaCy not available, using fallback methods")
 
 
 SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))
@@ -72,17 +88,34 @@ def download_text(url: str) -> Article:
 
 def get_significant_words_list(text: str) -> List[str]:
     """Get a list of important words excluding stop words and punctuation"""
-    # Simplified version without spaCy
-    import re
     words = []
-    # Basic stop words
-    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
-
-    # Simple word extraction
-    words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
-    for word in words_raw:
-        if word not in stopwords and word not in punctuation and len(word) > 2:
-            words.append(word)
+
+    if nlp is not None:
+        # Use spaCy for better word extraction
+        doc = nlp(text)
+        for token in doc:
+            if (not token.is_stop and
+                not token.is_punct and
+                not token.is_space and
+                len(token.text) > 2 and
+                token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']):
+                words.append(token.lemma_.lower())
+    else:
+        # Fallback to NLTK and basic processing
+        try:
+            from nltk.corpus import stopwords
+            from nltk.tokenize import word_tokenize
+            stop_words = set(stopwords.words('english'))
+        except:
+            # Basic stop words fallback
+            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
+
+        # Simple word extraction
+        words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+        for word in words_raw:
+            if word not in stop_words and word not in punctuation and len(word) > 2:
+                words.append(word)
+
 
     return words
 
@@ -100,15 +133,23 @@ def get_frequency_words(words: List[str]) -> Counter:
 def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
     """Get sentence importance scores based on word frequencies"""
     sent_strength = {}
-    import re
 
     for sent in sentences:
-        words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())
+        if nlp is not None:
+            # Use spaCy for better sentence processing
+            doc = nlp(sent)
+            words = [token.lemma_.lower() for token in doc
+                     if not token.is_stop and not token.is_punct and not token.is_space]
+        else:
+            # Fallback to simple word extraction
+            words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())
+
         score = 0
         for word in words:
             if word in freq_word:
                 score += freq_word[word]
         sent_strength[sent] = score
+
 
     return sent_strength
 
@@ -124,9 +165,43 @@ def get_extractive_summary(sent_strength: Dict, n_sents: int = 5):
 
 
 def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
-    """Generate extractive summary using simplified pipeline"""
-    # Always use LSA for now since spaCy is disabled
-    return extractive_summary_lsa(text, n_sents)
+    """Generate extractive summary using the best available method"""
+    if nlp is not None:
+        # Use spaCy-based extractive summarization
+        return extractive_summary_spacy(text, n_sents)
+    else:
+        # Fallback to LSA
+        return extractive_summary_lsa(text, n_sents)
+
+
+def extractive_summary_spacy(text: str, n_sents: int = 5) -> str:
+    """Generate extractive summary using spaCy-based approach"""
+    try:
+        # Split text into sentences using spaCy
+        doc = nlp(text)
+        sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10]
+
+        if len(sentences) <= n_sents:
+            return ' '.join(sentences)
+
+        # Get significant words
+        words = get_significant_words_list(text)
+
+        # Get word frequencies
+        freq_word = get_frequency_words(words)
+
+        # Calculate sentence strengths
+        sent_strength = get_sent_strength(sentences, freq_word)
+
+        # Extract top sentences
+        summary_sentences = get_extractive_summary(sent_strength, n_sents)
+
+        return ' '.join(summary_sentences)
+
+    except Exception as e:
+        logger.error(f"Error in spaCy summarization: {e}")
+        # Fallback to LSA
+        return extractive_summary_lsa(text, n_sents)
 
 
 def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -144,15 +219,21 @@ def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
 
 def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
     """Split document into chunks with maximum token length"""
-    # Simple sentence splitting fallback
-    sentences = document.split('.')
     chunks = []
     current_chunk = ""
 
+    if nlp is not None:
+        # Use spaCy for better sentence splitting
+        doc = nlp(document)
+        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
+    else:
+        # Fallback to simple sentence splitting
+        sentences = [s.strip() for s in document.split('.') if s.strip()]
+
     for sentence in sentences:
-        if not sentence.strip():
+        if not sentence:
             continue
-        test_chunk = current_chunk + sentence + "."
+        test_chunk = current_chunk + " " + sentence if current_chunk else sentence
         tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
 
         if len(tokens) <= token_max_length:
@@ -160,15 +241,13 @@ def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length
         else:
             if current_chunk:
                 chunks.append(current_chunk)
-            current_chunk = sentence + "."
+            current_chunk = sentence
 
     if current_chunk:
         chunks.append(current_chunk)
 
-    sents = chunks
-
-    logger.info(f'Returning {len(sents)} number of chunk strings')
-    return sents
+    logger.info(f'Returning {len(chunks)} number of chunk strings')
+    return chunks
 
 
 def generate_summary_from_text(text: str) -> str:
@@ -177,11 +256,15 @@ def generate_summary_from_text(text: str) -> str:
     logger.info(f"Generating summary from text of length: {len(text)}")
 
     try:
-        total_summary = extractive_summary_lsa(text, n_sents=5)
+        total_summary = extractive_summary_pipeline(text, n_sents=5)
     except Exception as e:
         logger.error(f"Error generating summary: {e}")
         # Fallback to simple truncation
-        sentences = text.split('.')[:3]
+        if nlp is not None:
+            doc = nlp(text)
+            sentences = [sent.text.strip() for sent in doc.sents][:3]
+        else:
+            sentences = text.split('.')[:3]
         total_summary = '. '.join(sentences) + '.'
 
     logger.info(f"*** ELAPSED CREATE SUMMARY FROM TEXT: {time.time() - start} s")
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 3b48fbe..9424a8c 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -12,7 +12,7 @@ pytest-asyncio==0.21.1
 pytest-cov==4.1.0
 torch==2.5.1
 transformers==4.40.0
-spacy==3.8.2
+spacy==3.8.7
 newspaper3k==0.2.8
 lxml_html_clean==0.4.2
 sumy==0.11.0