2 changes: 1 addition & 1 deletion backend/app/core/config.py
@@ -3,7 +3,7 @@


class Settings(BaseSettings):
database_url: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/summarizerdb"
database_url: str = "sqlite+aiosqlite:///./summarizerdb.db"
secret_key: str = "your-secret-key-here-change-in-production"
algorithm: str = "HS256"
access_token_expire_minutes: int = 30
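Note on the config change: the diff does not show how `database_url` is consumed, so the following is only a minimal sketch, assuming the app builds a SQLAlchemy async engine from `settings.database_url`; the `sqlite+aiosqlite` scheme also requires the `aiosqlite` driver to be installed.

```python
# Minimal sketch (assumption): wiring the new SQLite URL into an async engine.
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker

from app.core.config import settings

# "sqlite+aiosqlite:///./summarizerdb.db" points at a file in the working
# directory, so no database server is needed, unlike the old asyncpg URL.
engine = create_async_engine(settings.database_url)
async_session = async_sessionmaker(engine, expire_on_commit=False)
```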
10 changes: 5 additions & 5 deletions backend/app/core/security.py
@@ -1,6 +1,6 @@
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
from typing import Optional
-from jose import JWTError, jwt
+import jwt
from passlib.context import CryptContext
from app.core.config import settings

@@ -10,9 +10,9 @@
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
to_encode = data.copy()
if expires_delta:
-expire = datetime.utcnow() + expires_delta
+expire = datetime.now(timezone.utc) + expires_delta
else:
-expire = datetime.utcnow() + timedelta(minutes=15)
+expire = datetime.now(timezone.utc) + timedelta(minutes=15)
to_encode.update({"exp": expire})
encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
return encoded_jwt
@@ -33,5 +33,5 @@ def verify_token(token: str) -> Optional[str]:
if username is None:
return None
return username
-except JWTError:
+except jwt.InvalidTokenError:
return None
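For reference, a minimal round trip with the new PyJWT import and timezone-aware expiry; the "sub" claim and the inline secret are illustrative stand-ins for `settings.secret_key` / `settings.algorithm`, not code from this file.

```python
# Sketch of the PyJWT flow the hunks above switch to (values are placeholders).
from datetime import datetime, timedelta, timezone
import jwt  # PyJWT, replacing python-jose

SECRET, ALG = "change-me", "HS256"

payload = {"sub": "alice", "exp": datetime.now(timezone.utc) + timedelta(minutes=15)}
token = jwt.encode(payload, SECRET, algorithm=ALG)

try:
    claims = jwt.decode(token, SECRET, algorithms=[ALG])  # checks signature and exp
    print(claims["sub"])
except jwt.InvalidTokenError:  # also catches ExpiredSignatureError
    print("invalid or expired token")
```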
134 changes: 53 additions & 81 deletions backend/app/core/summarizer.py
@@ -7,8 +7,8 @@
from collections import Counter, namedtuple
from operator import attrgetter

-import spacy
-from spacy.lang.en.stop_words import STOP_WORDS
+# import spacy
+# from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import nltk
from newspaper import Article
@@ -48,11 +48,11 @@
summarizer.stop_words = get_stop_words(LANGUAGE)

# Load spacy model
-try:
-nlp = spacy.load("en_core_web_sm")
-except OSError:
-logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
-nlp = None
+# try:
+# nlp = spacy.load("en_core_web_sm")
+# except OSError:
+# logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
+nlp = None

SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))

@@ -70,16 +70,19 @@ def download_text(url: str) -> Article:
return article


-def get_significant_words_list(doc) -> List[str]:
-"""Get a list of important words (PROPN; ADJ; NOUN; VERB) excluding stop words and punctuation"""
+def get_significant_words_list(text: str) -> List[str]:
+"""Get a list of important words excluding stop words and punctuation"""
+# Simplified version without spaCy
+import re
words = []
-stopwords = list(STOP_WORDS)
-pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
-for token in doc:
-if (token.text in stopwords or token.text in punctuation):
-continue
-if (token.pos_ in pos_tag):
-words.append(token.text)
+# Basic stop words
+stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
+
+# Simple word extraction
+words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+for word in words_raw:
+if word not in stopwords and word not in punctuation and len(word) > 2:
+words.append(word)
return words
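A quick usage sketch of the regex-based replacement above (toy input; `get_frequency_words` itself is only referenced in the next hunk, not shown in full):

```python
# Toy example of the simplified word extraction (no spaCy involved).
text = "The model summarizes long articles. Articles are split into sentences."
words = get_significant_words_list(text)
# -> ['model', 'summarizes', 'long', 'articles', 'articles', 'split', 'into', 'sentences']
freq_word = get_frequency_words(words)  # Counter over those words
```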


@@ -94,16 +97,18 @@ def get_frequency_words(words: List[str]) -> Counter:
return freq_word


-def get_sent_strength(doc, freq_word: Counter) -> Dict:
+def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
"""Get sentence importance scores based on word frequencies"""
sent_strength = {}
-for sent in doc.sents:
-for word in sent:
-if word.text in freq_word.keys():
-if sent in sent_strength.keys():
-sent_strength[sent] += freq_word[word.text]
-else:
-sent_strength[sent] = freq_word[word.text]
+import re
+
+for sent in sentences:
+words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())
+score = 0
+for word in words:
+if word in freq_word:
+score += freq_word[word]
+sent_strength[sent] = score
return sent_strength
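A small sketch of how the string-based scorer pairs with `get_extractive_summary` below (toy data; the `SentenceInfo` construction that feeds `get_extractive_summary` is outside this diff):

```python
from collections import Counter

sentences = [
    "Transformers dominate modern summarization",
    "Cats sleep a lot",
    "Extractive summarization picks the highest scoring sentences",
]
freq_word = Counter({"summarization": 2, "transformers": 1, "extractive": 1, "sentences": 1})

scores = get_sent_strength(sentences, freq_word)
# {"Transformers dominate modern summarization": 3, "Cats sleep a lot": 0,
#  "Extractive summarization picks the highest scoring sentences": 4}
top = get_extractive_summary(scores, n_sents=2)  # top-scored sentences, original order
```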


@@ -115,29 +120,13 @@ def get_extractive_summary(sent_strength: Dict, n_sents: int = 5):
infos = sorted(infos, key=attrgetter("rates"), reverse=True)[:n_sents]
infos = sorted(infos, key=attrgetter("order"))
logger.info(f"Extracted {len(infos)} sentences ...")
-return tuple(i.sentence.text for i in infos)
+return tuple(i.sentence for i in infos)


-def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
-"""Generate extractive summary using spacy pipeline"""
-if not nlp:
-return extractive_summary_lsa(doc, n_sents)
-
-doc = nlp(doc)
-logger.info(f"Starting to compute summary from {len(list(doc.sents))} sentences ...")
-words = get_significant_words_list(doc)
-freq_word = get_frequency_words(words)
-sent_strength = get_sent_strength(doc, freq_word)
-
-summaries = get_extractive_summary(sent_strength, n_sents=n_sents)
-if not summaries:
-return extractive_summary_lsa(doc.text, n_sents)
-
-start_sentence = list(doc.sents)[0].text
-total_summary = ' '.join(summaries)
-if start_sentence in summaries:
-return total_summary
-return start_sentence + ' ' + total_summary
+def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
+"""Generate extractive summary using simplified pipeline"""
+# Always use LSA for now since spaCy is disabled
+return extractive_summary_lsa(text, n_sents)


def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -155,45 +144,28 @@ def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:

def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
"""Split document into chunks with maximum token length"""
-if not nlp:
-# Simple sentence splitting fallback
-sentences = document.split('.')
-chunks = []
-current_chunk = ""
-
-for sentence in sentences:
-test_chunk = current_chunk + sentence + "."
-tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
-
-if len(tokens) <= token_max_length:
-current_chunk = test_chunk
-else:
-if current_chunk:
-chunks.append(current_chunk)
-current_chunk = sentence + "."
-
-if current_chunk:
-chunks.append(current_chunk)
-
-return chunks
+# Simple sentence splitting fallback
+sentences = document.split('.')
+chunks = []
+current_chunk = ""

-sents = []
-length = 0
-doc = nlp(document)
-s = ''
-for sentence in doc.sents:
-tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
-length += len(tokens_in_sentence)
-if length <= token_max_length:
-s += sentence.text
+for sentence in sentences:
+if not sentence.strip():
+continue
+test_chunk = current_chunk + sentence + "."
+tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
+
+if len(tokens) <= token_max_length:
+current_chunk = test_chunk
else:
-sents.append(s)
-s = sentence.text
-length = len(tokens_in_sentence)
+if current_chunk:
+chunks.append(current_chunk)
+current_chunk = sentence + "."

-# Append last string
-if s:
-sents.append(s)
+if current_chunk:
+chunks.append(current_chunk)
+
+sents = chunks

logger.info(f'Returning {len(sents)} number of chunk strings')
return sents
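The rewritten `get_nest_sentences` keeps only the period-splitting path and packs sentences into chunks that stay under the token budget; a usage sketch with a Hugging Face tokenizer follows (the model name is just an example, not taken from this repo):

```python
# Sketch: chunking a long document for a summarization model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")  # example model

long_text = "First sentence of the article. Second sentence with more detail. " * 400
chunks = get_nest_sentences(long_text, tokenizer, token_max_length=1024)

# Each chunk should fit the model's context window, e.g.:
print(len(chunks), max(len(tokenizer(c)["input_ids"]) for c in chunks))
```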
6 changes: 4 additions & 2 deletions backend/requirements.txt
@@ -12,10 +12,12 @@ pytest-asyncio==0.21.1
pytest-cov==4.1.0
torch==2.5.1
transformers==4.40.0
-spacy==3.7.6
+spacy==3.8.2
newspaper3k==0.2.8
lxml_html_clean==0.4.2
sumy==0.11.0
-python-jose[cryptography]==3.3.0
+PyJWT==2.8.0
+cryptography==41.0.7
passlib[bcrypt]==1.7.4
python-dotenv==1.0.0
gunicorn==21.2.0