diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 0872911..a6f5963 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -3,7 +3,7 @@
 
 
 class Settings(BaseSettings):
-    database_url: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/summarizerdb"
+    database_url: str = "sqlite+aiosqlite:///./summarizerdb.db"
     secret_key: str = "your-secret-key-here-change-in-production"
     algorithm: str = "HS256"
     access_token_expire_minutes: int = 30
diff --git a/backend/app/core/security.py b/backend/app/core/security.py
index 0318d5e..d504e8a 100644
--- a/backend/app/core/security.py
+++ b/backend/app/core/security.py
@@ -1,6 +1,6 @@
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Optional
-from jose import JWTError, jwt
+import jwt
 from passlib.context import CryptContext
 from app.core.config import settings
 
@@ -10,9 +10,9 @@
 def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
     to_encode = data.copy()
     if expires_delta:
-        expire = datetime.utcnow() + expires_delta
+        expire = datetime.now(timezone.utc) + expires_delta
     else:
-        expire = datetime.utcnow() + timedelta(minutes=15)
+        expire = datetime.now(timezone.utc) + timedelta(minutes=15)
     to_encode.update({"exp": expire})
     encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
     return encoded_jwt
@@ -33,5 +33,5 @@ def verify_token(token: str) -> Optional[str]:
         if username is None:
             return None
         return username
-    except JWTError:
+    except jwt.InvalidTokenError:
         return None
\ No newline at end of file
diff --git a/backend/app/core/summarizer.py b/backend/app/core/summarizer.py
index bd74259..cb6be9b 100644
--- a/backend/app/core/summarizer.py
+++ b/backend/app/core/summarizer.py
@@ -2,13 +2,23 @@
 import json
 import time
 import logging
+import re
 from typing import List, Dict
 from functools import lru_cache
 from collections import Counter, namedtuple
 from operator import attrgetter
 
-import spacy
-from spacy.lang.en.stop_words import STOP_WORDS
+# spaCy import with fallback
+try:
+    import spacy
+    from spacy.lang.en.stop_words import STOP_WORDS
+    SPACY_AVAILABLE = True
+    print("spaCy loaded successfully")
+except Exception as e:
+    print(f"spaCy not available: {e}")
+    SPACY_AVAILABLE = False
+    STOP_WORDS = set()
+
 from string import punctuation
 import nltk
 from newspaper import Article
@@ -24,6 +34,7 @@
 # Download required NLTK data
 try:
     nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
 except:
     pass
 
@@ -48,11 +59,16 @@
 summarizer.stop_words = get_stop_words(LANGUAGE)
 
 # Load spacy model
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
-    nlp = None
+nlp = None
+if SPACY_AVAILABLE:
+    try:
+        nlp = spacy.load("en_core_web_sm")
+        logger.info("spaCy model 'en_core_web_sm' loaded successfully")
+    except OSError:
+        logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
+        nlp = None
+else:
+    logger.info("spaCy not available, using fallback methods")
 
 
 SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))
@@ -70,16 +86,36 @@ def download_text(url: str) -> Article:
     return article
 
 
-def get_significant_words_list(doc) -> List[str]:
-    """Get a list of important words (PROPN; ADJ; NOUN; VERB) excluding stop words and punctuation"""
+def get_significant_words_list(text: str) -> List[str]:
+    """Get a list of important words excluding stop words and punctuation"""
     words = []
-    stopwords = list(STOP_WORDS)
-    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
-    for token in doc:
-        if (token.text in stopwords or token.text in punctuation):
-            continue
-        if (token.pos_ in pos_tag):
-            words.append(token.text)
+
+    if nlp is not None:
+        # Use spaCy for better word extraction
+        doc = nlp(text)
+        for token in doc:
+            if (not token.is_stop and
+                not token.is_punct and
+                not token.is_space and
+                len(token.text) > 2 and
+                token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']):
+                words.append(token.lemma_.lower())
+    else:
+        # Fallback to NLTK and basic processing
+        try:
+            from nltk.corpus import stopwords
+            from nltk.tokenize import word_tokenize
+            stop_words = set(stopwords.words('english'))
+        except:
+            # Basic stop words fallback
+            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
+
+        # Simple word extraction
+        words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+        for word in words_raw:
+            if word not in stop_words and word not in punctuation and len(word) > 2:
+                words.append(word)
+
     return words
 
 
@@ -94,16 +130,26 @@
     return freq_word
 
 
-def get_sent_strength(doc, freq_word: Counter) -> Dict:
+def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
     """Get sentence importance scores based on word frequencies"""
     sent_strength = {}
-    for sent in doc.sents:
-        for word in sent:
-            if word.text in freq_word.keys():
-                if sent in sent_strength.keys():
-                    sent_strength[sent] += freq_word[word.text]
-                else:
-                    sent_strength[sent] = freq_word[word.text]
+
+    for sent in sentences:
+        if nlp is not None:
+            # Use spaCy for better sentence processing
+            doc = nlp(sent)
+            words = [token.lemma_.lower() for token in doc
+                     if not token.is_stop and not token.is_punct and not token.is_space]
+        else:
+            # Fallback to simple word extraction
+            words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())
+
+        score = 0
+        for word in words:
+            if word in freq_word:
+                score += freq_word[word]
+        sent_strength[sent] = score
+
     return sent_strength
 
 
@@ -115,29 +161,47 @@
     infos = sorted(infos, key=attrgetter("rates"), reverse=True)[:n_sents]
     infos = sorted(infos, key=attrgetter("order"))
     logger.info(f"Extracted {len(infos)} sentences ...")
-    return tuple(i.sentence.text for i in infos)
+    return tuple(i.sentence for i in infos)
 
 
-def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
-    """Generate extractive summary using spacy pipeline"""
-    if not nlp:
-        return extractive_summary_lsa(doc, n_sents)
-
-    doc = nlp(doc)
-    logger.info(f"Starting to compute summary from {len(list(doc.sents))} sentences ...")
-    words = get_significant_words_list(doc)
-    freq_word = get_frequency_words(words)
-    sent_strength = get_sent_strength(doc, freq_word)
-
-    summaries = get_extractive_summary(sent_strength, n_sents=n_sents)
-    if not summaries:
-        return extractive_summary_lsa(doc.text, n_sents)
-
-    start_sentence = list(doc.sents)[0].text
-    total_summary = ' '.join(summaries)
-    if start_sentence in summaries:
-        return total_summary
-    return start_sentence + ' ' + total_summary
+def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
+    """Generate extractive summary using the best available method"""
+    if nlp is not None:
+        # Use spaCy-based extractive summarization
+        return extractive_summary_spacy(text, n_sents)
+    else:
+        # Fallback to LSA
+        return extractive_summary_lsa(text, n_sents)
+
+
+def extractive_summary_spacy(text: str, n_sents: int = 5) -> str:
+    """Generate extractive summary using spaCy-based approach"""
+    try:
+        # Split text into sentences using spaCy
+        doc = nlp(text)
+        sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10]
+
+        if len(sentences) <= n_sents:
+            return ' '.join(sentences)
+
+        # Get significant words
+        words = get_significant_words_list(text)
+
+        # Get word frequencies
+        freq_word = get_frequency_words(words)
+
+        # Calculate sentence strengths
+        sent_strength = get_sent_strength(sentences, freq_word)
+
+        # Extract top sentences
+        summary_sentences = get_extractive_summary(sent_strength, n_sents)
+
+        return ' '.join(summary_sentences)
+
+    except Exception as e:
+        logger.error(f"Error in spaCy summarization: {e}")
+        # Fallback to LSA
+        return extractive_summary_lsa(text, n_sents)
 
 
 def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -155,48 +219,35 @@
 
 def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
     """Split document into chunks with maximum token length"""
-    if not nlp:
-        # Simple sentence splitting fallback
-        sentences = document.split('.')
-        chunks = []
-        current_chunk = ""
-
-        for sentence in sentences:
-            test_chunk = current_chunk + sentence + "."
-            tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
-
-            if len(tokens) <= token_max_length:
-                current_chunk = test_chunk
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = sentence + "."
-
-        if current_chunk:
-            chunks.append(current_chunk)
-
-        return chunks
+    chunks = []
+    current_chunk = ""
 
-    sents = []
-    length = 0
-    doc = nlp(document)
-    s = ''
-    for sentence in doc.sents:
-        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
-        length += len(tokens_in_sentence)
-        if length <= token_max_length:
-            s += sentence.text
+    if nlp is not None:
+        # Use spaCy for better sentence splitting
+        doc = nlp(document)
+        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
+    else:
+        # Fallback to simple sentence splitting
+        sentences = [s.strip() for s in document.split('.') if s.strip()]
+
+    for sentence in sentences:
+        if not sentence:
+            continue
+        test_chunk = current_chunk + " " + sentence if current_chunk else sentence
+        tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
+
+        if len(tokens) <= token_max_length:
+            current_chunk = test_chunk
         else:
-            sents.append(s)
-            s = sentence.text
-            length = len(tokens_in_sentence)
+            if current_chunk:
+                chunks.append(current_chunk)
+            current_chunk = sentence
 
-    # Append last string
-    if s:
-        sents.append(s)
+    if current_chunk:
+        chunks.append(current_chunk)
 
-    logger.info(f'Returning {len(sents)} number of chunk strings')
-    return sents
+    logger.info(f'Returning {len(chunks)} number of chunk strings')
+    return chunks
 
 
 def generate_summary_from_text(text: str) -> str:
@@ -205,11 +256,15 @@
     logger.info(f"Generating summary from text of length: {len(text)}")
 
     try:
-        total_summary = extractive_summary_lsa(text, n_sents=5)
+        total_summary = extractive_summary_pipeline(text, n_sents=5)
     except Exception as e:
         logger.error(f"Error generating summary: {e}")
        # Fallback to simple truncation
-        sentences = text.split('.')[:3]
+        if nlp is not None:
+            doc = nlp(text)
+            sentences = [sent.text.strip() for sent in doc.sents][:3]
+        else:
+            sentences = text.split('.')[:3]
         total_summary = '. '.join(sentences) + '.'
 
     logger.info(f"*** ELAPSED CREATE SUMMARY FROM TEXT: {time.time() - start} s")
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 06a3ac9..9424a8c 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -12,10 +12,12 @@ pytest-asyncio==0.21.1
 pytest-cov==4.1.0
 torch==2.5.1
 transformers==4.40.0
-spacy==3.7.6
+spacy==3.8.7
 newspaper3k==0.2.8
+lxml_html_clean==0.4.2
 sumy==0.11.0
-python-jose[cryptography]==3.3.0
+PyJWT==2.8.0
+cryptography==41.0.7
 passlib[bcrypt]==1.7.4
 python-dotenv==1.0.0
 gunicorn==21.2.0
\ No newline at end of file