Skip to content
15 changes: 8 additions & 7 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ class Settings(BaseSettings):
openai_timeout_seconds: float = 10.0
chat_answer_model: str = "gpt-4o-mini"
notice_summary_model: str = "gpt-4o-mini"

chat_retrieval_version_single: str = "hybrid-dormitory-search-v1"
chat_retrieval_version_grouped: str = "hybrid-dormitory-search-unspecified-v1"
chat_retrieval_method_single: str = "hybrid_dormitory_top_k"
chat_retrieval_method_grouped: str = "hybrid_unspecified_dormitory_top_k"

chat_prompt_version_single: str = "chat-answer-citation-v2"
chat_prompt_version_grouped: str = "chat-answer-grouped-citation-v2"
chat_retrieval_version_single: str = "dormitory-search-v1"
chat_retrieval_version_grouped: str = "dormitory-search-unspecified-v1"
chat_retrieval_method_single: str = "vector_dormitory_top_k"
chat_retrieval_method_grouped: str = "vector_unspecified_dormitory_top_k"
chat_no_answer_message: str = "관련 정보를 찾을 수 없습니다."
chat_invalid_question_message: str = "기숙사 관련 질문을 입력해주세요."
chat_single_dormitory_top_k: int = 3
Expand All @@ -33,8 +35,8 @@ class Settings(BaseSettings):
chat_session_timeout_minutes: int = 30

chat_fallback_top_k: int = 5
chat_retrieval_method_fallback: str = "vector_all_dormitories_fallback"
chat_retrieval_version_fallback: str = "dormitory-search-fallback-v1"
chat_retrieval_method_fallback: str = "hybrid_all_dormitories_fallback"
chat_retrieval_version_fallback: str = "hybrid-dormitory-search-fallback-v1"

chat_fallback_similarity_threshold: float = 0.35

Expand All @@ -56,4 +58,3 @@ def get_settings() -> Settings:


Settings.model_config = SettingsConfigDict(extra="ignore")

268 changes: 267 additions & 1 deletion app/repositories/regulation_chunk_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,4 +261,270 @@ def search_similar_chunks_all_dormitories(
"similarity": float(row.similarity),
}
for row in result
]
]


def search_hybrid_chunks(
    db: Session,
    query_text: str,
    query_embedding: list[float],
    dormitory: str,
    top_k: int = 3,
    candidate_k: int = 20,
    keyword_weight: float = 0.3,
):
    """Hybrid search scoped to one dormitory plus shared (dormitory-NULL) docs."""
    # Shared documents carry a NULL dormitory, so they match every dormitory.
    scope_predicate = "(rd.dormitory = :dormitory OR rd.dormitory IS NULL)"
    return _search_hybrid_chunks(
        db=db,
        query_text=query_text,
        query_embedding=query_embedding,
        top_k=top_k,
        candidate_k=candidate_k,
        keyword_weight=keyword_weight,
        filter_sql=scope_predicate,
        params={"dormitory": dormitory},
    )


def search_hybrid_chunks_for_dormitories(
    db: Session,
    query_text: str,
    query_embedding: list[float],
    dormitories: list[str],
    top_k: int = 3,
    candidate_k: int = 20,
    keyword_weight: float = 0.3,
):
    """Hybrid search scoped to several dormitories plus shared (dormitory-NULL) docs."""
    # ANY(:dormitories) binds the Python list as a Postgres array parameter.
    scope_predicate = "(rd.dormitory = ANY(:dormitories) OR rd.dormitory IS NULL)"
    return _search_hybrid_chunks(
        db=db,
        query_text=query_text,
        query_embedding=query_embedding,
        top_k=top_k,
        candidate_k=candidate_k,
        keyword_weight=keyword_weight,
        filter_sql=scope_predicate,
        params={"dormitories": dormitories},
    )


def search_hybrid_chunks_all_dormitories(
    db: Session,
    query_text: str,
    query_embedding: list[float],
    top_k: int = 5,
    candidate_k: int = 30,
    keyword_weight: float = 0.3,
):
    """Hybrid search over every active regulation chunk, with no dormitory filter."""
    # "TRUE" keeps the shared WHERE-clause template valid while filtering nothing.
    return _search_hybrid_chunks(
        db=db,
        query_text=query_text,
        query_embedding=query_embedding,
        top_k=top_k,
        candidate_k=candidate_k,
        keyword_weight=keyword_weight,
        filter_sql="TRUE",
        params={},
    )


def _search_hybrid_chunks(
    db: Session,
    *,
    query_text: str,
    query_embedding: list[float],
    top_k: int,
    candidate_k: int,
    keyword_weight: float,
    filter_sql: str,
    params: dict,
) -> list[dict]:
    """
    Rank regulation chunks by a weighted blend of vector and keyword relevance.

    Two candidate sets are fetched (each capped at ``candidate_k``): nearest
    neighbors by pgvector cosine distance, and full-text matches ranked with
    ``ts_rank_cd``.  They are merged, deduplicated per chunk, and re-scored as::

        hybrid = vector_score + keyword_weight * normalized_keyword * (1 - vector_score)

    so the keyword signal can only fill the headroom the vector score leaves,
    keeping the result in [0, 1].  The returned ``similarity`` field carries
    the final hybrid score; the vector-only score stays in ``vector_similarity``.

    Args:
        db: Active SQLAlchemy session.
        query_text: Raw user query for full-text matching (stripped before binding).
        query_embedding: Query vector; must match the ``embedding`` column dimension.
        top_k: Number of rows returned after re-ranking.
        candidate_k: Per-branch candidate pool size before merging.
        keyword_weight: Weight (expected 0..1) applied to the normalized keyword score.
        filter_sql: Trusted, internally-built predicate over ``rc``/``rd``.
            It is interpolated into the SQL, not bound — never pass user input.
        params: Bind parameters referenced by ``filter_sql``.

    Returns:
        List of dicts ordered by ``hybrid_score`` descending.
    """
    # pgvector accepts the textual literal form "[x1,x2,...]" for a vector cast.
    embedding_str = "[" + ",".join(map(str, query_embedding)) + "]"

    # Single source of truth for the full-text document expression (it is used
    # in the rank, its ORDER BY, and the match predicate).
    # NOTE(review): computing to_tsvector per row at query time cannot use a
    # GIN index; a precomputed tsvector column + GIN index is the planned
    # follow-up optimization.
    ts_document = """to_tsvector(
                    'simple',
                    COALESCE(rc.chunk_text, '') || ' ' ||
                    COALESCE(rd.content, '') || ' ' ||
                    COALESCE(rc.keywords::text, '')
                )"""

    sql = text(
        f"""
        WITH vector_search AS (
            SELECT
                rc.regulation_chunk_id,
                rd.document_id,
                rd.document_version,
                rc.chunk_id,
                COALESCE(rc.chunk_text, rd.content, '') AS content,
                rd.source,
                rd.source_url,
                rd.dormitory,
                1 - (rc.embedding <=> CAST(:embedding AS vector)) AS vector_similarity,
                NULL::float AS keyword_score,
                ROW_NUMBER() OVER (
                    ORDER BY rc.embedding <=> CAST(:embedding AS vector)
                ) AS vector_rank,
                NULL::bigint AS keyword_rank
            FROM regulation_chunk rc
            JOIN regulation_document rd
                ON rd.regulation_document_id = rc.regulation_document_id
            WHERE {filter_sql}
                AND rc.is_active = TRUE
                AND rd.is_active = TRUE
                AND rc.embedding IS NOT NULL
            ORDER BY rc.embedding <=> CAST(:embedding AS vector)
            LIMIT :candidate_k
        ),

        keyword_search AS (
            SELECT
                rc.regulation_chunk_id,
                rd.document_id,
                rd.document_version,
                rc.chunk_id,
                COALESCE(rc.chunk_text, rd.content, '') AS content,
                rd.source,
                rd.source_url,
                rd.dormitory,
                1 - (rc.embedding <=> CAST(:embedding AS vector)) AS vector_similarity,
                ts_rank_cd(
                    {ts_document},
                    websearch_to_tsquery('simple', :query_text)
                ) AS keyword_score,
                NULL::bigint AS vector_rank,
                ROW_NUMBER() OVER (
                    ORDER BY ts_rank_cd(
                        {ts_document},
                        websearch_to_tsquery('simple', :query_text)
                    ) DESC
                ) AS keyword_rank
            FROM regulation_chunk rc
            JOIN regulation_document rd
                ON rd.regulation_document_id = rc.regulation_document_id
            WHERE {filter_sql}
                AND rc.is_active = TRUE
                AND rd.is_active = TRUE
                AND rc.embedding IS NOT NULL
                AND {ts_document} @@ websearch_to_tsquery('simple', :query_text)
            ORDER BY keyword_score DESC
            LIMIT :candidate_k
        ),

        combined AS (
            SELECT * FROM vector_search
            UNION ALL
            SELECT * FROM keyword_search
        ),

        -- A chunk can appear in both branches; collapse to one row, keeping
        -- the best rank from each branch and the non-NULL scores via MAX.
        dedup AS (
            SELECT
                regulation_chunk_id,
                MAX(document_id) AS document_id,
                MAX(document_version) AS document_version,
                MAX(chunk_id) AS chunk_id,
                MAX(content) AS content,
                MAX(source) AS source,
                MAX(source_url) AS source_url,
                MAX(dormitory) AS dormitory,
                MAX(vector_similarity) AS vector_similarity,
                MAX(keyword_score) AS keyword_score,
                MIN(vector_rank) AS vector_rank,
                MIN(keyword_rank) AS keyword_rank
            FROM combined
            GROUP BY regulation_chunk_id
        ),

        -- Clamp the vector score to [0, 1], min-max-normalize the keyword
        -- score within this candidate set, then blend.
        scored AS (
            SELECT
                *,
                LEAST(1, GREATEST(0, COALESCE(vector_similarity, 0))) AS vector_score,
                COALESCE(keyword_score / NULLIF(MAX(keyword_score) OVER (), 0), 0) AS normalized_keyword_score,
                (
                    LEAST(1, GREATEST(0, COALESCE(vector_similarity, 0))) +
                    (
                        :keyword_weight *
                        COALESCE(keyword_score / NULLIF(MAX(keyword_score) OVER (), 0), 0) *
                        (1 - LEAST(1, GREATEST(0, COALESCE(vector_similarity, 0))))
                    )
                ) AS hybrid_score
            FROM dedup
        )

        SELECT
            regulation_chunk_id,
            document_id,
            document_version,
            chunk_id,
            content,
            source,
            source_url,
            dormitory,
            vector_similarity,
            vector_score,
            keyword_score,
            normalized_keyword_score,
            vector_rank,
            keyword_rank,
            hybrid_score
        FROM scored
        ORDER BY hybrid_score DESC
        LIMIT :top_k
        """
    )

    result = db.execute(
        sql,
        {
            "embedding": embedding_str,
            "query_text": query_text.strip(),
            "top_k": top_k,
            "candidate_k": candidate_k,
            "keyword_weight": keyword_weight,
            **params,
        },
    ).mappings().all()

    return [
        {
            "regulation_chunk_id": row.regulation_chunk_id,
            "document_id": row.document_id,
            "document_version": row.document_version,
            "chunk_id": row.chunk_id,
            "content": row.content,
            "source": row.source,
            "source_url": row.source_url,
            "retrieval_group": row.dormitory,
            # "similarity" intentionally carries the combined score so callers
            # of the vector-only search functions can consume this shape too.
            "similarity": float(row.hybrid_score),
            "vector_similarity": float(row.vector_similarity) if row.vector_similarity is not None else None,
            "vector_score": float(row.vector_score) if row.vector_score is not None else None,
            "keyword_score": float(row.keyword_score) if row.keyword_score is not None else None,
            "normalized_keyword_score": (
                float(row.normalized_keyword_score)
                if row.normalized_keyword_score is not None
                else None
            ),
            "vector_rank": int(row.vector_rank) if row.vector_rank is not None else None,
            "keyword_rank": int(row.keyword_rank) if row.keyword_rank is not None else None,
            "hybrid_score": float(row.hybrid_score),
        }
        for row in result
    ]
Loading
Loading