2 changes: 1 addition & 1 deletion backend/app/core/config.py
@@ -3,7 +3,7 @@


class Settings(BaseSettings):
database_url: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/summarizerdb"
database_url: str = "sqlite+aiosqlite:///./summarizerdb.db"
secret_key: str = "your-secret-key-here-change-in-production"
algorithm: str = "HS256"
access_token_expire_minutes: int = 30
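
For context, a minimal sketch (not part of this diff) of how the new SQLite URL would be consumed, assuming the app builds its async engine from settings.database_url with SQLAlchemy's async API; the engine and session names here are illustrative:

# Sketch only, assuming SQLAlchemy's async engine is used elsewhere in the app.
# "sqlite+aiosqlite:///./summarizerdb.db" needs the aiosqlite driver installed,
# just as the old URL needed asyncpg.
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from app.core.config import settings

engine = create_async_engine(settings.database_url, echo=False)
SessionLocal = async_sessionmaker(engine, expire_on_commit=False)
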
10 changes: 5 additions & 5 deletions backend/app/core/security.py
@@ -1,6 +1,6 @@
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from typing import Optional
from jose import JWTError, jwt
import jwt
from passlib.context import CryptContext
from app.core.config import settings

@@ -10,9 +10,9 @@
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
to_encode = data.copy()
if expires_delta:
expire = datetime.utcnow() + expires_delta
expire = datetime.now(timezone.utc) + expires_delta
else:
expire = datetime.utcnow() + timedelta(minutes=15)
expire = datetime.now(timezone.utc) + timedelta(minutes=15)
to_encode.update({"exp": expire})
encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
return encoded_jwt
@@ -33,5 +33,5 @@ def verify_token(token: str) -> Optional[str]:
if username is None:
return None
return username
except JWTError:
except jwt.InvalidTokenError:
return None
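
For context, a minimal sketch (not part of this diff) exercising the helpers after the python-jose to PyJWT switch, assuming verify_token reads the conventional "sub" claim; PyJWT validates the "exp" claim on decode and raises jwt.ExpiredSignatureError, a subclass of jwt.InvalidTokenError, so expired tokens land in the same except branch:

# Sketch only; the payload and claim values are illustrative.
from datetime import timedelta

from app.core.security import create_access_token, verify_token

# A fresh token round-trips through decode and returns the username.
token = create_access_token({"sub": "alice"}, expires_delta=timedelta(minutes=5))
assert verify_token(token) == "alice"

# An already-expired token is rejected inside verify_token and yields None.
stale = create_access_token({"sub": "alice"}, expires_delta=timedelta(minutes=-1))
assert verify_token(stale) is None
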
225 changes: 140 additions & 85 deletions backend/app/core/summarizer.py
@@ -2,13 +2,23 @@
import json
import time
import logging
import re
from typing import List, Dict
from functools import lru_cache
from collections import Counter, namedtuple
from operator import attrgetter

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# spaCy import with fallback
try:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
SPACY_AVAILABLE = True
print("spaCy loaded successfully")
except Exception as e:
print(f"spaCy not available: {e}")
SPACY_AVAILABLE = False
STOP_WORDS = set()

from string import punctuation
import nltk
from newspaper import Article
@@ -24,6 +34,7 @@
# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
except:
pass

@@ -48,11 +59,16 @@
summarizer.stop_words = get_stop_words(LANGUAGE)

# Load spacy model
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
nlp = None
nlp = None
if SPACY_AVAILABLE:
try:
nlp = spacy.load("en_core_web_sm")
logger.info("spaCy model 'en_core_web_sm' loaded successfully")
except OSError:
logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
nlp = None
else:
logger.info("spaCy not available, using fallback methods")

SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))

@@ -70,16 +86,36 @@ def download_text(url: str) -> Article:
return article


def get_significant_words_list(doc) -> List[str]:
"""Get a list of important words (PROPN; ADJ; NOUN; VERB) excluding stop words and punctuation"""
def get_significant_words_list(text: str) -> List[str]:
"""Get a list of important words excluding stop words and punctuation"""
words = []
stopwords = list(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
for token in doc:
if (token.text in stopwords or token.text in punctuation):
continue
if (token.pos_ in pos_tag):
words.append(token.text)

if nlp is not None:
# Use spaCy for better word extraction
doc = nlp(text)
for token in doc:
if (not token.is_stop and
not token.is_punct and
not token.is_space and
len(token.text) > 2 and
token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']):
words.append(token.lemma_.lower())
else:
# Fallback to NLTK and basic processing
try:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
except:
# Basic stop words fallback
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}

# Simple word extraction
words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
for word in words_raw:
if word not in stop_words and word not in punctuation and len(word) > 2:
words.append(word)

return words


@@ -94,16 +130,26 @@ def get_frequency_words(words: List[str]) -> Counter:
return freq_word


def get_sent_strength(doc, freq_word: Counter) -> Dict:
def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
"""Get sentence importance scores based on word frequencies"""
sent_strength = {}
for sent in doc.sents:
for word in sent:
if word.text in freq_word.keys():
if sent in sent_strength.keys():
sent_strength[sent] += freq_word[word.text]
else:
sent_strength[sent] = freq_word[word.text]

for sent in sentences:
if nlp is not None:
# Use spaCy for better sentence processing
doc = nlp(sent)
words = [token.lemma_.lower() for token in doc
if not token.is_stop and not token.is_punct and not token.is_space]
else:
# Fallback to simple word extraction
words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())

score = 0
for word in words:
if word in freq_word:
score += freq_word[word]
sent_strength[sent] = score

return sent_strength


@@ -115,29 +161,47 @@ def get_extractive_summary(sent_strength: Dict, n_sents: int = 5):
infos = sorted(infos, key=attrgetter("rates"), reverse=True)[:n_sents]
infos = sorted(infos, key=attrgetter("order"))
logger.info(f"Extracted {len(infos)} sentences ...")
return tuple(i.sentence.text for i in infos)
return tuple(i.sentence for i in infos)


def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
"""Generate extractive summary using spacy pipeline"""
if not nlp:
return extractive_summary_lsa(doc, n_sents)

doc = nlp(doc)
logger.info(f"Starting to compute summary from {len(list(doc.sents))} sentences ...")
words = get_significant_words_list(doc)
freq_word = get_frequency_words(words)
sent_strength = get_sent_strength(doc, freq_word)

summaries = get_extractive_summary(sent_strength, n_sents=n_sents)
if not summaries:
return extractive_summary_lsa(doc.text, n_sents)

start_sentence = list(doc.sents)[0].text
total_summary = ' '.join(summaries)
if start_sentence in summaries:
return total_summary
return start_sentence + ' ' + total_summary
def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
"""Generate extractive summary using the best available method"""
if nlp is not None:
# Use spaCy-based extractive summarization
return extractive_summary_spacy(text, n_sents)
else:
# Fallback to LSA
return extractive_summary_lsa(text, n_sents)


def extractive_summary_spacy(text: str, n_sents: int = 5) -> str:
"""Generate extractive summary using spaCy-based approach"""
try:
# Split text into sentences using spaCy
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10]

if len(sentences) <= n_sents:
return ' '.join(sentences)

# Get significant words
words = get_significant_words_list(text)

# Get word frequencies
freq_word = get_frequency_words(words)

# Calculate sentence strengths
sent_strength = get_sent_strength(sentences, freq_word)

# Extract top sentences
summary_sentences = get_extractive_summary(sent_strength, n_sents)

return ' '.join(summary_sentences)

except Exception as e:
logger.error(f"Error in spaCy summarization: {e}")
# Fallback to LSA
return extractive_summary_lsa(text, n_sents)


def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -155,48 +219,35 @@ def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:

def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
"""Split document into chunks with maximum token length"""
if not nlp:
# Simple sentence splitting fallback
sentences = document.split('.')
chunks = []
current_chunk = ""

for sentence in sentences:
test_chunk = current_chunk + sentence + "."
tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']

if len(tokens) <= token_max_length:
current_chunk = test_chunk
else:
if current_chunk:
chunks.append(current_chunk)
current_chunk = sentence + "."

if current_chunk:
chunks.append(current_chunk)

return chunks
chunks = []
current_chunk = ""

sents = []
length = 0
doc = nlp(document)
s = ''
for sentence in doc.sents:
tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
length += len(tokens_in_sentence)
if length <= token_max_length:
s += sentence.text
if nlp is not None:
# Use spaCy for better sentence splitting
doc = nlp(document)
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
else:
# Fallback to simple sentence splitting
sentences = [s.strip() for s in document.split('.') if s.strip()]

for sentence in sentences:
if not sentence:
continue
test_chunk = current_chunk + " " + sentence if current_chunk else sentence
tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']

if len(tokens) <= token_max_length:
current_chunk = test_chunk
else:
sents.append(s)
s = sentence.text
length = len(tokens_in_sentence)
if current_chunk:
chunks.append(current_chunk)
current_chunk = sentence

# Append last string
if s:
sents.append(s)
if current_chunk:
chunks.append(current_chunk)

logger.info(f'Returning {len(sents)} number of chunk strings')
return sents
logger.info(f'Returning {len(chunks)} number of chunk strings')
return chunks


def generate_summary_from_text(text: str) -> str:
@@ -205,11 +256,15 @@ def generate_summary_from_text(text: str) -> str:
logger.info(f"Generating summary from text of length: {len(text)}")

try:
total_summary = extractive_summary_lsa(text, n_sents=5)
total_summary = extractive_summary_pipeline(text, n_sents=5)
except Exception as e:
logger.error(f"Error generating summary: {e}")
# Fallback to simple truncation
sentences = text.split('.')[:3]
if nlp is not None:
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents][:3]
else:
sentences = text.split('.')[:3]
total_summary = '. '.join(sentences) + '.'

logger.info(f"*** ELAPSED CREATE SUMMARY FROM TEXT: {time.time() - start} s")
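
For context, a minimal usage sketch (not part of this diff) of the refactored fallback chain, assuming the module is importable as app.core.summarizer: with spaCy and en_core_web_sm installed, extractive_summary_pipeline scores sentences by lemma frequency; otherwise it drops to the sumy LSA path, and generate_summary_from_text falls back to simple truncation if summarization raises:

# Sketch only; the sample text is illustrative.
from app.core.summarizer import extractive_summary_pipeline, generate_summary_from_text

text = (
    "The spacecraft entered orbit on Tuesday. Engineers confirmed all systems were nominal. "
    "The mission will map the surface for two years. Scientists hope to find traces of water ice. "
    "A follow-up lander is planned for the end of the decade."
)

print(extractive_summary_pipeline(text, n_sents=2))
print(generate_summary_from_text(text))
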
6 changes: 4 additions & 2 deletions backend/requirements.txt
@@ -12,10 +12,12 @@ pytest-asyncio==0.21.1
pytest-cov==4.1.0
torch==2.5.1
transformers==4.40.0
spacy==3.7.6
spacy==3.8.7
newspaper3k==0.2.8
lxml_html_clean==0.4.2
sumy==0.11.0
python-jose[cryptography]==3.3.0
PyJWT==2.8.0
cryptography==41.0.7
passlib[bcrypt]==1.7.4
python-dotenv==1.0.0
gunicorn==21.2.0