2 changes: 1 addition & 1 deletion backend/app/core/config.py
@@ -3,7 +3,7 @@


class Settings(BaseSettings):
database_url: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/summarizerdb"
database_url: str = "sqlite+aiosqlite:///./summarizerdb.db"
secret_key: str = "your-secret-key-here-change-in-production"
algorithm: str = "HS256"
access_token_expire_minutes: int = 30
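
For context, a minimal sketch (not part of this diff) of how the new SQLite URL would be consumed, assuming the app builds its async engine from settings.database_url with SQLAlchemy's async API; the engine and session names here are illustrative:

# Sketch only, assuming SQLAlchemy's async engine is used elsewhere in the app.
# "sqlite+aiosqlite:///./summarizerdb.db" needs the aiosqlite driver installed,
# just as the old URL needed asyncpg.
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from app.core.config import settings

engine = create_async_engine(settings.database_url, echo=False)
SessionLocal = async_sessionmaker(engine, expire_on_commit=False)
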
10 changes: 5 additions & 5 deletions backend/app/core/security.py
@@ -1,6 +1,6 @@
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from typing import Optional
from jose import JWTError, jwt
import jwt
from passlib.context import CryptContext
from app.core.config import settings

@@ -10,9 +10,9 @@
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
to_encode = data.copy()
if expires_delta:
expire = datetime.utcnow() + expires_delta
expire = datetime.now(timezone.utc) + expires_delta
else:
expire = datetime.utcnow() + timedelta(minutes=15)
expire = datetime.now(timezone.utc) + timedelta(minutes=15)
to_encode.update({"exp": expire})
encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
return encoded_jwt
@@ -33,5 +33,5 @@ def verify_token(token: str) -> Optional[str]:
if username is None:
return None
return username
except JWTError:
except jwt.InvalidTokenError:
return None
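
For context, a minimal sketch (not part of this diff) exercising the helpers after the python-jose to PyJWT switch, assuming verify_token reads the conventional "sub" claim; PyJWT validates the "exp" claim on decode and raises jwt.ExpiredSignatureError, a subclass of jwt.InvalidTokenError, so expired tokens land in the same except branch:

# Sketch only; the payload and claim values are illustrative.
from datetime import timedelta

from app.core.security import create_access_token, verify_token

# A fresh token round-trips through decode and returns the username.
token = create_access_token({"sub": "alice"}, expires_delta=timedelta(minutes=5))
assert verify_token(token) == "alice"

# An already-expired token is rejected inside verify_token and yields None.
stale = create_access_token({"sub": "alice"}, expires_delta=timedelta(minutes=-1))
assert verify_token(stale) is None
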
225 changes: 140 additions & 85 deletions backend/app/core/summarizer.py
@@ -2,13 +2,23 @@
import json
import time
import logging
import re
from typing import List, Dict
from functools import lru_cache
from collections import Counter, namedtuple
from operator import attrgetter

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# spaCy import with fallback
try:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
SPACY_AVAILABLE = True
print("spaCy loaded successfully")
except Exception as e:
print(f"spaCy not available: {e}")
SPACY_AVAILABLE = False
STOP_WORDS = set()

from string import punctuation
import nltk
from newspaper import Article
@@ -24,6 +34,7 @@
# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
except:
pass

@@ -48,11 +59,16 @@
summarizer.stop_words = get_stop_words(LANGUAGE)

# Load spacy model
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
nlp = None
nlp = None
if SPACY_AVAILABLE:
try:
nlp = spacy.load("en_core_web_sm")
logger.info("spaCy model 'en_core_web_sm' loaded successfully")
except OSError:
logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
nlp = None
else:
logger.info("spaCy not available, using fallback methods")

SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))

@@ -70,16 +86,36 @@ def download_text(url: str) -> Article:
return article


def get_significant_words_list(doc) -> List[str]:
"""Get a list of important words (PROPN; ADJ; NOUN; VERB) excluding stop words and punctuation"""
def get_significant_words_list(text: str) -> List[str]:
"""Get a list of important words excluding stop words and punctuation"""
words = []
stopwords = list(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
for token in doc:
if (token.text in stopwords or token.text in punctuation):
continue
if (token.pos_ in pos_tag):
words.append(token.text)

if nlp is not None:
# Use spaCy for better word extraction
doc = nlp(text)
for token in doc:
if (not token.is_stop and
not token.is_punct and
not token.is_space and
len(token.text) > 2 and
token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']):
words.append(token.lemma_.lower())
else:
# Fallback to NLTK and basic processing
try:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
except:
# Basic stop words fallback
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}

# Simple word extraction
words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
for word in words_raw:
if word not in stop_words and word not in punctuation and len(word) > 2:
words.append(word)

return words


@@ -94,16 +130,26 @@ def get_frequency_words(words: List[str]) -> Counter:
return freq_word


def get_sent_strength(doc, freq_word: Counter) -> Dict:
def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
"""Get sentence importance scores based on word frequencies"""
sent_strength = {}
for sent in doc.sents:
for word in sent:
if word.text in freq_word.keys():
if sent in sent_strength.keys():
sent_strength[sent] += freq_word[word.text]
else:
sent_strength[sent] = freq_word[word.text]

for sent in sentences:
if nlp is not None:
# Use spaCy for better sentence processing
doc = nlp(sent)
words = [token.lemma_.lower() for token in doc
if not token.is_stop and not token.is_punct and not token.is_space]
else:
# Fallback to simple word extraction
words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())

score = 0
for word in words:
if word in freq_word:
score += freq_word[word]
sent_strength[sent] = score

return sent_strength


@@ -115,29 +161,47 @@ def get_extractive_summary(sent_strength: Dict, n_sents: int = 5):
infos = sorted(infos, key=attrgetter("rates"), reverse=True)[:n_sents]
infos = sorted(infos, key=attrgetter("order"))
logger.info(f"Extracted {len(infos)} sentences ...")
return tuple(i.sentence.text for i in infos)
return tuple(i.sentence for i in infos)


def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
"""Generate extractive summary using spacy pipeline"""
if not nlp:
return extractive_summary_lsa(doc, n_sents)

doc = nlp(doc)
logger.info(f"Starting to compute summary from {len(list(doc.sents))} sentences ...")
words = get_significant_words_list(doc)
freq_word = get_frequency_words(words)
sent_strength = get_sent_strength(doc, freq_word)

summaries = get_extractive_summary(sent_strength, n_sents=n_sents)
if not summaries:
return extractive_summary_lsa(doc.text, n_sents)

start_sentence = list(doc.sents)[0].text
total_summary = ' '.join(summaries)
if start_sentence in summaries:
return total_summary
return start_sentence + ' ' + total_summary
def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
"""Generate extractive summary using the best available method"""
if nlp is not None:
# Use spaCy-based extractive summarization
return extractive_summary_spacy(text, n_sents)
else:
# Fallback to LSA
return extractive_summary_lsa(text, n_sents)


def extractive_summary_spacy(text: str, n_sents: int = 5) -> str:
"""Generate extractive summary using spaCy-based approach"""
try:
# Split text into sentences using spaCy
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10]

if len(sentences) <= n_sents:
return ' '.join(sentences)

# Get significant words
words = get_significant_words_list(text)

# Get word frequencies
freq_word = get_frequency_words(words)

# Calculate sentence strengths
sent_strength = get_sent_strength(sentences, freq_word)

# Extract top sentences
summary_sentences = get_extractive_summary(sent_strength, n_sents)

return ' '.join(summary_sentences)

except Exception as e:
logger.error(f"Error in spaCy summarization: {e}")
# Fallback to LSA
return extractive_summary_lsa(text, n_sents)


def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -155,48 +219,35 @@ def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:

def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
"""Split document into chunks with maximum token length"""
if not nlp:
# Simple sentence splitting fallback
sentences = document.split('.')
chunks = []
current_chunk = ""

for sentence in sentences:
test_chunk = current_chunk + sentence + "."
tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']

if len(tokens) <= token_max_length:
current_chunk = test_chunk
else:
if current_chunk:
chunks.append(current_chunk)
current_chunk = sentence + "."

if current_chunk:
chunks.append(current_chunk)

return chunks
chunks = []
current_chunk = ""

sents = []
length = 0
doc = nlp(document)
s = ''
for sentence in doc.sents:
tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
length += len(tokens_in_sentence)
if length <= token_max_length:
s += sentence.text
if nlp is not None:
# Use spaCy for better sentence splitting
doc = nlp(document)
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
else:
# Fallback to simple sentence splitting
sentences = [s.strip() for s in document.split('.') if s.strip()]

for sentence in sentences:
if not sentence:
continue
test_chunk = current_chunk + " " + sentence if current_chunk else sentence
tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']

if len(tokens) <= token_max_length:
current_chunk = test_chunk
else:
sents.append(s)
s = sentence.text
length = len(tokens_in_sentence)
if current_chunk:
chunks.append(current_chunk)
current_chunk = sentence

# Append last string
if s:
sents.append(s)
if current_chunk:
chunks.append(current_chunk)

logger.info(f'Returning {len(sents)} number of chunk strings')
return sents
logger.info(f'Returning {len(chunks)} number of chunk strings')
return chunks


def generate_summary_from_text(text: str) -> str:
@@ -205,11 +256,15 @@ def generate_summary_from_text(text: str) -> str:
logger.info(f"Generating summary from text of length: {len(text)}")

try:
total_summary = extractive_summary_lsa(text, n_sents=5)
total_summary = extractive_summary_pipeline(text, n_sents=5)
except Exception as e:
logger.error(f"Error generating summary: {e}")
# Fallback to simple truncation
sentences = text.split('.')[:3]
if nlp is not None:
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents][:3]
else:
sentences = text.split('.')[:3]
total_summary = '. '.join(sentences) + '.'

logger.info(f"*** ELAPSED CREATE SUMMARY FROM TEXT: {time.time() - start} s")
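
For context, a minimal usage sketch (not part of this diff) of the refactored fallback chain, assuming the module is importable as app.core.summarizer: with spaCy and en_core_web_sm installed, extractive_summary_pipeline scores sentences by lemma frequency; otherwise it drops to the sumy LSA path, and generate_summary_from_text falls back to simple truncation if summarization raises:

# Sketch only; the sample text is illustrative.
from app.core.summarizer import extractive_summary_pipeline, generate_summary_from_text

text = (
    "The spacecraft entered orbit on Tuesday. Engineers confirmed all systems were nominal. "
    "The mission will map the surface for two years. Scientists hope to find traces of water ice. "
    "A follow-up lander is planned for the end of the decade."
)

print(extractive_summary_pipeline(text, n_sents=2))
print(generate_summary_from_text(text))
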
6 changes: 4 additions & 2 deletions backend/requirements.txt
@@ -12,10 +12,12 @@ pytest-asyncio==0.21.1
pytest-cov==4.1.0
torch==2.5.1
transformers==4.40.0
spacy==3.7.6
spacy==3.8.7
newspaper3k==0.2.8
lxml_html_clean==0.4.2
sumy==0.11.0
python-jose[cryptography]==3.3.0
PyJWT==2.8.0
cryptography==41.0.7
passlib[bcrypt]==1.7.4
python-dotenv==1.0.0
gunicorn==21.2.0