From 947ff822b46888eba74ad3d6f98962363ea3c329 Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 17:56:19 +0900 Subject: [PATCH 1/9] =?UTF-8?q?:zap:=20perf:=20=EC=9D=91=EB=8B=B5=20?= =?UTF-8?q?=EC=86=8D=EB=8F=84=20=EA=B0=9C=EC=84=A0=EC=9D=84=20=EC=9C=84?= =?UTF-8?q?=ED=95=9C=20Dockerfile=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9ccf8df..0d804d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11-slim +FROM python:3.11 WORKDIR /app @@ -8,4 +8,10 @@ RUN pip install --no-cache-dir -r requirements.txt COPY . . RUN python -m leftovers.domain.recommend.service.train + +# 스레드 수 조정 +ENV OMP_NUM_THREADS=4 +ENV OPENBLAS_NUM_THREADS=4 +ENV MKL_NUM_THREADS=4 + CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] From 4b20cc67a7ba345416b1e30bd0b220e67abb7449 Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 18:06:14 +0900 Subject: [PATCH 2/9] =?UTF-8?q?:zap:=20perf:=20=EB=B2=A1=ED=84=B0=EA=B0=80?= =?UTF-8?q?=20L2=20=EC=A0=95=EA=B7=9C=ED=99=94=EB=90=98=EC=96=B4=20?= =?UTF-8?q?=EC=9E=88=EC=96=B4=20linear=5Fkernel=EB=A1=9C=20=EB=B3=80?= =?UTF-8?q?=EA=B2=BD=20(cosine=5Fsimilarity=20=EB=8C=80=EB=B9=84=20?= =?UTF-8?q?=EC=97=B0=EC=82=B0=20=EB=B9=84=EC=9A=A9=20=EC=A0=88=EA=B0=90)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leftovers/domain/recommend/service/matcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/leftovers/domain/recommend/service/matcher.py b/leftovers/domain/recommend/service/matcher.py index eef839c..a842827 100644 --- a/leftovers/domain/recommend/service/matcher.py +++ b/leftovers/domain/recommend/service/matcher.py @@ -1,5 +1,6 @@ import numpy as np from sklearn.metrics.pairwise import cosine_similarity +from sklearn.metrics.pairwise import linear_kernel from leftovers.domain.recommend.service import loader # 메뉴 이름이 유사한 것 찾기 @@ -8,7 +9,7 @@ def match_top1(query: str): return (-1, "", 0.0) # 매칭 실패 query_vector = loader._NAME_VEC.transform([str(query)]) # 문자열을 벡터로 변환 - similarity_list = cosine_similarity(query_vector, loader._NAME_MAT).ravel() # 코사인 유사도(두 벡터 간의 유사도) 측정 + similarity_list = linear_kernel(query_vector, loader._NAME_MAT).ravel() # 두 벡터 간의 유사도 측정 idx = int(np.argmax(similarity_list)) # 가장 높은 유사도를 가진 인덱스 similarity = float(similarity_list[idx]) From f437c0c000db9b7f0d720a57af71dd6b0ee3d376 Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 18:22:19 +0900 Subject: [PATCH 3/9] =?UTF-8?q?:zap:=20perf:=20=EB=A9=94=EB=89=B4=20?= =?UTF-8?q?=EB=B2=A1=ED=84=B0=20=EC=A1=B0=ED=9A=8C=20=EC=8B=9C=20transform?= =?UTF-8?q?=20=EB=8C=80=EC=8B=A0=20dict=20lookup=20=ED=99=9C=EC=9A=A9?= =?UTF-8?q?=ED=95=98=EC=97=AC=20=EC=9D=91=EB=8B=B5=20=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leftovers/domain/recommend/service/loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/leftovers/domain/recommend/service/loader.py b/leftovers/domain/recommend/service/loader.py index 4fa7488..21613d2 100644 --- a/leftovers/domain/recommend/service/loader.py +++ b/leftovers/domain/recommend/service/loader.py @@ -13,16 +13,16 @@ _DB_ROWS: List[dict] = [] # 음식 데이터 _NAME_LIST: List[str] = [] # 음식 이름 리스트 _NAME_VEC = None # 음식 이름 벡터화(음식 이름 문자열을 숫자 벡터로 변환) -_NAME_MAT = None # 벡터화 결과 저장소(매트릭스) +_NAME_MAT = None # 벡터화 결과 저장소(매트릭스) : 유사도 계산 전체 돌릴 때 사용 +_NAME_LOOKUP = {} # 벡터화 결과 딕셔너리 : 메뉴가 DB에 있는 경우, 그 벡터만 필요할 때 사용 _IMPUTER = None # 결측치를 적절한 값으로 채워주는 보간기 _SCALER = None # 값들의 크기를 일정한 값으로 맞춰주는 도구 _MODELS = {} # 컨셉별 ML 모델 _CALIB = None # 점수 보정기 - # 캐시에 DB와 모델 전부 로딩 def load_all(): - global _DB_ROWS, _NAME_LIST, _NAME_VEC, _NAME_MAT, _IMPUTER, _SCALER, _MODELS, _CALIB + global _DB_ROWS, _NAME_LIST, _NAME_VEC, _NAME_MAT, _IMPUTER, _SCALER, _MODELS, _CALIB, _NAME_LOOKUP _DB_ROWS = load_kfda_excels(FOOD_FILES, sheet_name=None) @@ -30,6 +30,7 @@ def load_all(): _NAME_MAT = joblib.load(f"{MODEL_DIR}/name_matrix.joblib") _NAME_LIST = joblib.load(f"{MODEL_DIR}/name_list.joblib") + _NAME_LOOKUP = {name: vec for name, vec in zip(_NAME_LIST, _NAME_MAT)} _IMPUTER = joblib.load(f"{MODEL_DIR}/nutrition_imputer.joblib") _SCALER = joblib.load(f"{MODEL_DIR}/nutrition_scaler.joblib") From 4dab0c863336ead783295a07a21ae4b932f67334 Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 18:23:41 +0900 Subject: [PATCH 4/9] =?UTF-8?q?:zap:=20perf:=20=EC=9E=85=EB=A0=A5=EC=9D=B4?= =?UTF-8?q?=20DB=EC=97=90=20=EC=9E=88=EB=8A=94=20=EA=B2=BD=EC=9A=B0=20tran?= =?UTF-8?q?sform=20=EC=8A=A4=ED=82=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leftovers/domain/recommend/service/matcher.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/leftovers/domain/recommend/service/matcher.py b/leftovers/domain/recommend/service/matcher.py index a842827..303a238 100644 --- a/leftovers/domain/recommend/service/matcher.py +++ b/leftovers/domain/recommend/service/matcher.py @@ -8,6 +8,11 @@ def match_top1(query: str): if not loader._NAME_LIST: # 로딩된 메뉴가 없을 경우 return (-1, "", 0.0) # 매칭 실패 + if query in loader._NAME_LOOKUP: # 입력이 DB에 있는 경우 transform 스킵 + query_vector = loader._NAME_LOOKUP[query] + else: + query_vector = loader._NAME_VEC.transform([str(query)]) + query_vector = loader._NAME_VEC.transform([str(query)]) # 문자열을 벡터로 변환 similarity_list = linear_kernel(query_vector, loader._NAME_MAT).ravel() # 두 벡터 간의 유사도 측정 idx = int(np.argmax(similarity_list)) # 가장 높은 유사도를 가진 인덱스 From ae81e4aa185a9c060bab91c9f76d947051af874a Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 19:05:26 +0900 Subject: [PATCH 5/9] =?UTF-8?q?:zap:=20perf:=20=EB=B0=B0=EC=B9=98=20?= =?UTF-8?q?=EC=B2=98=EB=A6=AC=EB=A1=9C=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/recommend/api/recommend_api.py | 3 +- .../domain/recommend/service/evaluator.py | 79 +++++++++++-------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/leftovers/domain/recommend/api/recommend_api.py b/leftovers/domain/recommend/api/recommend_api.py index 1e3d82b..3f47e20 100644 --- a/leftovers/domain/recommend/api/recommend_api.py +++ b/leftovers/domain/recommend/api/recommend_api.py @@ -13,8 +13,7 @@ def recommend(req: RecommendReq): if req.concept not in loader.CONCEPTS: # 컨셉명이 올바르지 않으면 400 에러 return fail(400, {"message": f"알 수 없는 컨셉: {req.concept}"}).model_dump() - # 요청 메뉴를 돌면서 evaluate_item 호출 - items = [evaluator.evaluate_item(req.concept, a) for a in req.items] + items = evaluator.evaluate_items(req.concept, req.items) ranked = [r for r in items if r.matched_name] # 이름이 매칭되지 않으면 제외 ranked.sort(key=lambda r: (r.suitability, r.similarity), reverse=True) # 적합도와 유사도가 높은 순으로 정렬 topn = ranked[: max(1, int(req.count))] # 요청한 개수만큼만 반환 diff --git a/leftovers/domain/recommend/service/evaluator.py b/leftovers/domain/recommend/service/evaluator.py index b580df9..53bd7ec 100644 --- a/leftovers/domain/recommend/service/evaluator.py +++ b/leftovers/domain/recommend/service/evaluator.py @@ -17,41 +17,54 @@ def to_feat(n: dict) -> np.ndarray: return np.array([kcal, protein, fat, carbs, sugar, fiber, sodium, sat_fat, netcarb], dtype=float) # 데이터에서 유사한 메뉴 찾아 점수를 계산하여 반환 -def evaluate_item(concept: str, menu: str): - idx, b_name, sim = matcher.match_top1(menu) # 입력 메뉴명과 가장 유사한 메뉴 찾기 +def evaluate_items(concept: str, menus: list[str]) -> list[MatchItem]: + matched = [] + features = [] - if idx < 0: - return MatchItem(input_menu=menu, note="매칭 실패") + for menu in menus: + idx, b_name, sim = matcher.match_top1(menu) # 메뉴 매칭 + if idx < 0: + matched.append(MatchItem(input_menu=menu, note="매칭 실패")) + features.append(None) + continue + + b_row = dict(loader._DB_ROWS[idx]) # 음식 데이터에서 해당 행을 딕셔너리 형태로 가져옴 + b_row["name"] = b_name - b_row = dict(loader._DB_ROWS[idx]) # 음식 데이터에서 해당 행을 딕셔너리 형태로 가져옴 - b_row["name"] = b_name + x_num = to_feat(b_row) # 피쳐 추출 + matched.append((menu, b_name, sim, b_row)) # 후처리용 + features.append(x_num) + + valid_idx = [i for i, f in enumerate(features) if f is not None] # 매칭 실패 제거하고 batch 변환 + if not valid_idx: + return matched # 전부 실패면 그대로 반환 - x_num = to_feat(b_row).reshape(1, -1) # 영양성분을 숫자 벡터로 변환 - x_imp = loader._IMPUTER.transform(x_num) # 결측치 보간 - x_scaled = loader._SCALER.transform(x_imp) # 모델 학습 범위에 맞게 정규화 + X = np.stack([features[i] for i in valid_idx]) # (N, d) 행렬 + X = loader._IMPUTER.transform(X) # 결측치 보간 + X = loader._SCALER.transform(X) # 모델 학습 범위에 맞게 정규화 model = loader._MODELS[concept] - pred = float(model.predict(x_scaled)[0]) # 모델이 예측한 적합도 - rule = float(compute_score(concept, b_row, loader._CALIB or None)) # 규칙 기반 점수 - fused = 0.3 * pred + 0.7 * rule - score_int = int(round(fused)) # 최종 점수 - - def nz(v): - try: - x = float(v) # NaN, inf 같은 값 들어오면 0.0 보정 - return 0.0 if not np.isfinite(x) else x - except Exception: - return 0.0 - - return MatchItem( - input_menu=menu, - matched_name=b_name, - similarity=round(sim, 3), - suitability=score_int - # detail=NutritionDetail( # 개발 시에만 반환 - # kcal=nz(b_row.get("kcal")), protein=nz(b_row.get("protein")), fat=nz(b_row.get("fat")), - # carbs=nz(b_row.get("carbs")), sugar=nz(b_row.get("sugar")), fiber=nz(b_row.get("fiber")), - # sodium=nz(b_row.get("sodium")), sat_fat=nz(b_row.get("sat_fat", b_row.get("fat", 0.0))) - # ), - # note=None - ) \ No newline at end of file + preds = model.predict(X) # 모델 배치 예측 + + results = [] + pred_i = 0 + for m in matched: + if isinstance(m, MatchItem): # 매칭 실패 + results.append(m) + else: + menu, b_name, sim, b_row = m + pred = float(preds[pred_i]) + rule = float(compute_score(concept, b_row, loader._CALIB or None)) + fused = 0.3 * pred + 0.7 * rule + score_int = int(round(fused)) + pred_i += 1 + + results.append( + MatchItem( + input_menu=menu, + matched_name=b_name, + similarity=round(sim, 3), + suitability=score_int, + ) + ) + return results From f5e97f1093a6b39d1ec9d5dedef218fba6b8475c Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 20:07:32 +0900 Subject: [PATCH 6/9] =?UTF-8?q?:zap:=20perf:=20dict=20->=20numpy=20?= =?UTF-8?q?=EB=B3=80=ED=99=98=EC=9D=84=20=EC=84=9C=EB=B2=84=20=EC=8B=9C?= =?UTF-8?q?=EC=9E=91=20=EC=8B=9C=20=EC=A0=84=EC=B2=98=EB=A6=AC=ED=95=B4?= =?UTF-8?q?=EC=84=9C=20=EC=BA=90=EC=8B=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .DS_Store | Bin 6148 -> 6148 bytes .../domain/recommend/service/evaluator.py | 4 +++- leftovers/domain/recommend/service/loader.py | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.DS_Store b/.DS_Store index a2e5ec1a10bd3de3d17cae0d8c6f8f8e3c4e4433..b2ccbde335f249f49704a1a864586cf876976260 100644 GIT binary patch delta 156 zcmZoMXffEJ%EBzkTRmBWC5myu-6Pw(? l#>4t;*5P=Q$rIUB;Veo1f((RqZsvxKi7Jen**X650{}yFF(?24 diff --git a/leftovers/domain/recommend/service/evaluator.py b/leftovers/domain/recommend/service/evaluator.py index 53bd7ec..b86a640 100644 --- a/leftovers/domain/recommend/service/evaluator.py +++ b/leftovers/domain/recommend/service/evaluator.py @@ -36,10 +36,12 @@ def evaluate_items(concept: str, menus: list[str]) -> list[MatchItem]: features.append(x_num) valid_idx = [i for i, f in enumerate(features) if f is not None] # 매칭 실패 제거하고 batch 변환 + if not valid_idx: return matched # 전부 실패면 그대로 반환 + + X = loader._DB_FEATS[[m[-1] for m in matched if not isinstance(m, MatchItem)]] # dict -> numpy 변환 대신 캐시된 _DB_FEATS 사용 - X = np.stack([features[i] for i in valid_idx]) # (N, d) 행렬 X = loader._IMPUTER.transform(X) # 결측치 보간 X = loader._SCALER.transform(X) # 모델 학습 범위에 맞게 정규화 diff --git a/leftovers/domain/recommend/service/loader.py b/leftovers/domain/recommend/service/loader.py index 21613d2..2057f63 100644 --- a/leftovers/domain/recommend/service/loader.py +++ b/leftovers/domain/recommend/service/loader.py @@ -1,7 +1,9 @@ from typing import List import joblib +import numpy as np from leftovers.domain.recommend.service.food_kfda_loader import load_kfda_excels +from leftovers.domain.recommend.service.evaluator import to_feat FOOD_FILES = ["leftovers/domain/recommend/data/foodData1.xlsx", "leftovers/domain/recommend/data/foodData2.xlsx"] MODEL_DIR = "leftovers/domain/recommend/model_store" @@ -11,6 +13,7 @@ # 캐시 이용 -> 서버 시작 시, 메모리에 로딩해두고 API 요청마다 바로 쓰게 _DB_ROWS: List[dict] = [] # 음식 데이터 +_DB_FEATS = None # numpy 캐시 _NAME_LIST: List[str] = [] # 음식 이름 리스트 _NAME_VEC = None # 음식 이름 벡터화(음식 이름 문자열을 숫자 벡터로 변환) _NAME_MAT = None # 벡터화 결과 저장소(매트릭스) : 유사도 계산 전체 돌릴 때 사용 @@ -22,22 +25,29 @@ # 캐시에 DB와 모델 전부 로딩 def load_all(): - global _DB_ROWS, _NAME_LIST, _NAME_VEC, _NAME_MAT, _IMPUTER, _SCALER, _MODELS, _CALIB, _NAME_LOOKUP + global _DB_ROWS, _DB_FEATS + global _NAME_LIST, _NAME_VEC, _NAME_MAT, _NAME_LOOKUP + global _IMPUTER, _SCALER, _MODELS, _CALIB _DB_ROWS = load_kfda_excels(FOOD_FILES, sheet_name=None) + feats = [to_feat(row) for row in _DB_ROWS] # dict -> numpy 변환을 미리해두기 + _DB_FEATS = np.vstack(feats).astype(np.float32) # float32로 메모리 최적화 + + # 이름 벡터 관련 _NAME_VEC = joblib.load(f"{MODEL_DIR}/name_vectorizer.joblib") _NAME_MAT = joblib.load(f"{MODEL_DIR}/name_matrix.joblib") _NAME_LIST = joblib.load(f"{MODEL_DIR}/name_list.joblib") - _NAME_LOOKUP = {name: vec for name, vec in zip(_NAME_LIST, _NAME_MAT)} + # 영양성분 전처리기 _IMPUTER = joblib.load(f"{MODEL_DIR}/nutrition_imputer.joblib") _SCALER = joblib.load(f"{MODEL_DIR}/nutrition_scaler.joblib") + # ML 모델 _MODELS = {c: joblib.load(f"{MODEL_DIR}/concept_model_{c}.joblib") for c in CONCEPTS} try: - _CALIB = joblib.load(f"{MODEL_DIR}/calibration.joblib") + _CALIB = joblib.load(f"{MODEL_DIR}/calibration.joblib") # calibration 로딩 except Exception: _CALIB = None From b79493efede0f1f3f979e0a5b263906217b7df8a Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 20:10:25 +0900 Subject: [PATCH 7/9] =?UTF-8?q?:recycle:=20refactor:=20dict=20->=20numpy?= =?UTF-8?q?=20=EB=B3=80=ED=99=98=20=EC=BA=90=EC=8B=9C=20=EC=82=AC=EC=9A=A9?= =?UTF-8?q?=EC=9C=BC=EC=9C=BC=EB=A1=9C=20=EC=9D=B8=ED=95=9C=20=EC=BD=94?= =?UTF-8?q?=EB=93=9C=20=EC=88=98=EC=A0=95=EC=82=AC=ED=95=AD=20=EB=B0=98?= =?UTF-8?q?=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leftovers/domain/recommend/service/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/leftovers/domain/recommend/service/evaluator.py b/leftovers/domain/recommend/service/evaluator.py index b86a640..ba05226 100644 --- a/leftovers/domain/recommend/service/evaluator.py +++ b/leftovers/domain/recommend/service/evaluator.py @@ -32,7 +32,7 @@ def evaluate_items(concept: str, menus: list[str]) -> list[MatchItem]: b_row["name"] = b_name x_num = to_feat(b_row) # 피쳐 추출 - matched.append((menu, b_name, sim, b_row)) # 후처리용 + matched.append((menu, b_name, sim, b_row, idx)) # 후처리용 features.append(x_num) valid_idx = [i for i, f in enumerate(features) if f is not None] # 매칭 실패 제거하고 batch 변환 @@ -54,7 +54,7 @@ def evaluate_items(concept: str, menus: list[str]) -> list[MatchItem]: if isinstance(m, MatchItem): # 매칭 실패 results.append(m) else: - menu, b_name, sim, b_row = m + menu, b_name, sim, b_row, idx = m pred = float(preds[pred_i]) rule = float(compute_score(concept, b_row, loader._CALIB or None)) fused = 0.3 * pred + 0.7 * rule From da0c79d2b6aa2f528b57c2b715d014113cc3333e Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 20:41:11 +0900 Subject: [PATCH 8/9] =?UTF-8?q?:zap:=20perf:=20match=5Ftop1=EC=9D=84=20Fai?= =?UTF-8?q?ss/HNSlib=20=EA=B8=B0=EB=B0=98=20ANN=20=EA=B2=80=EC=83=89?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leftovers/domain/recommend/api/recommend_api.py | 16 ++++++++++++++++ leftovers/domain/recommend/service/loader.py | 11 +++++++++++ leftovers/domain/recommend/service/matcher.py | 15 ++++++--------- requirements.txt | 3 ++- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/leftovers/domain/recommend/api/recommend_api.py b/leftovers/domain/recommend/api/recommend_api.py index 3f47e20..44804e8 100644 --- a/leftovers/domain/recommend/api/recommend_api.py +++ b/leftovers/domain/recommend/api/recommend_api.py @@ -4,19 +4,35 @@ from leftovers.domain.recommend.schemas.recommend_response import RecommendRes from leftovers.domain.recommend.service import evaluator, loader +import time + router = APIRouter(prefix="/menus") @router.post("/recommend", response_model=Envelope[RecommendRes]) def recommend(req: RecommendReq): + start = time.time() if not loader._DB_ROWS: # DB, 모델이 안 불러와졌으면 500 에러 return fail(500, {"message": "DB/모델이 비어있습니다."}).model_dump() if req.concept not in loader.CONCEPTS: # 컨셉명이 올바르지 않으면 400 에러 return fail(400, {"message": f"알 수 없는 컨셉: {req.concept}"}).model_dump() + + after_load = time.time() + # 요청 메뉴를 돌면서 evaluate_items 호출 items = evaluator.evaluate_items(req.concept, req.items) + + after_evaluate = time.time() + ranked = [r for r in items if r.matched_name] # 이름이 매칭되지 않으면 제외 ranked.sort(key=lambda r: (r.suitability, r.similarity), reverse=True) # 적합도와 유사도가 높은 순으로 정렬 topn = ranked[: max(1, int(req.count))] # 요청한 개수만큼만 반환 + after_ranked = time.time() res = RecommendRes(concept=req.concept, count=len(topn), items=topn) + + print(f"[DEBUG] 0. start {(start):.3f}s") + print(f"[DEBUG] 1. Loader check took {(after_load):.3f}s") + print(f"[DEBUG] 2. Evaluation took {(after_evaluate):.3f}s") + print(f"[DEBUG] 3. Ranking took {(after_ranked):.3f}s") + return ok(res) diff --git a/leftovers/domain/recommend/service/loader.py b/leftovers/domain/recommend/service/loader.py index 2057f63..336effd 100644 --- a/leftovers/domain/recommend/service/loader.py +++ b/leftovers/domain/recommend/service/loader.py @@ -1,6 +1,7 @@ from typing import List import joblib import numpy as np +import hnswlib from leftovers.domain.recommend.service.food_kfda_loader import load_kfda_excels from leftovers.domain.recommend.service.evaluator import to_feat @@ -23,6 +24,8 @@ _MODELS = {} # 컨셉별 ML 모델 _CALIB = None # 점수 보정기 +_HNSW_INDEX = None # ANN 인덱스 + # 캐시에 DB와 모델 전부 로딩 def load_all(): global _DB_ROWS, _DB_FEATS @@ -40,6 +43,14 @@ def load_all(): _NAME_LIST = joblib.load(f"{MODEL_DIR}/name_list.joblib") _NAME_LOOKUP = {name: vec for name, vec in zip(_NAME_LIST, _NAME_MAT)} + # HNSW 인덱스 구축 + dim = _NAME_MAT.shape[1] + _HNSW_INDEX = hnswlib.Index(space='cosine', dim=dim) + _HNSW_INDEX.init_index(max_elements=_NAME_MAT.shape[0], + ef_construction=200, M=16) + _HNSW_INDEX.add_items(_NAME_MAT.astype(np.float32)) + _HNSW_INDEX.set_ef(50) + # 영양성분 전처리기 _IMPUTER = joblib.load(f"{MODEL_DIR}/nutrition_imputer.joblib") _SCALER = joblib.load(f"{MODEL_DIR}/nutrition_scaler.joblib") diff --git a/leftovers/domain/recommend/service/matcher.py b/leftovers/domain/recommend/service/matcher.py index 303a238..86ccdeb 100644 --- a/leftovers/domain/recommend/service/matcher.py +++ b/leftovers/domain/recommend/service/matcher.py @@ -1,22 +1,19 @@ import numpy as np -from sklearn.metrics.pairwise import cosine_similarity -from sklearn.metrics.pairwise import linear_kernel from leftovers.domain.recommend.service import loader # 메뉴 이름이 유사한 것 찾기 def match_top1(query: str): - if not loader._NAME_LIST: # 로딩된 메뉴가 없을 경우 + if not loader._NAME_LIST or loader._HNSW_INDEX : # 로딩된 메뉴가 없을 경우, ANN 인덱스가 없을 경우 return (-1, "", 0.0) # 매칭 실패 if query in loader._NAME_LOOKUP: # 입력이 DB에 있는 경우 transform 스킵 - query_vector = loader._NAME_LOOKUP[query] + query_vector = loader._NAME_LOOKUP[query].astype(np.float32).reshape(1, -1) else: - query_vector = loader._NAME_VEC.transform([str(query)]) + query_vector = loader._NAME_VEC.transform([str(query)]).astype(np.float32) - query_vector = loader._NAME_VEC.transform([str(query)]) # 문자열을 벡터로 변환 - similarity_list = linear_kernel(query_vector, loader._NAME_MAT).ravel() # 두 벡터 간의 유사도 측정 - idx = int(np.argmax(similarity_list)) # 가장 높은 유사도를 가진 인덱스 - similarity = float(similarity_list[idx]) + labels, distances = loader._HNSW_INDEX.knn_query(query_vector, k=1) + idx = int(labels[0][0]) + similarity = 1 - float(distances[0][0]) # 가장 높은 유사도를 가진 인덱스 if not np.isfinite(similarity): # NaN이나 inf 나오면 similarity = 0.0 # 0.0.으로 보정 diff --git a/requirements.txt b/requirements.txt index 2a4e33d..816dce2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ python-dotenv==1.0.1 scikit-learn==1.5.1 joblib==1.4.2 scipy==1.13.1 -openai>=1.50.2 \ No newline at end of file +openai>=1.50.2 +hnswlib==0.8.0 \ No newline at end of file From 37864b0337b9ff07d913dac68bd1714dcdec93df Mon Sep 17 00:00:00 2001 From: frombunny Date: Sat, 23 Aug 2025 21:51:02 +0900 Subject: [PATCH 9/9] =?UTF-8?q?:recycle:=20refactor:=20match=5Ftop1?= =?UTF-8?q?=EC=9D=84=20Faiss/HNSlib=20=EA=B8=B0=EB=B0=98=20ANN=20=EA=B2=80?= =?UTF-8?q?=EC=83=89=20=EB=B3=80=EA=B2=BD=EC=9C=BC=EB=A1=9C=20=EC=9D=B8?= =?UTF-8?q?=ED=95=9C=20=EC=BD=94=EB=93=9C=20=EC=88=98=EC=A0=95=20=EC=82=AC?= =?UTF-8?q?=ED=95=AD=20=EB=B0=98=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leftovers/domain/recommend/service/loader.py | 23 +++++++++++++++---- leftovers/domain/recommend/service/matcher.py | 8 +++---- main.py | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/leftovers/domain/recommend/service/loader.py b/leftovers/domain/recommend/service/loader.py index 336effd..b042b11 100644 --- a/leftovers/domain/recommend/service/loader.py +++ b/leftovers/domain/recommend/service/loader.py @@ -2,6 +2,7 @@ import joblib import numpy as np import hnswlib +from sklearn.decomposition import TruncatedSVD from leftovers.domain.recommend.service.food_kfda_loader import load_kfda_excels from leftovers.domain.recommend.service.evaluator import to_feat @@ -30,33 +31,45 @@ def load_all(): global _DB_ROWS, _DB_FEATS global _NAME_LIST, _NAME_VEC, _NAME_MAT, _NAME_LOOKUP - global _IMPUTER, _SCALER, _MODELS, _CALIB - + global _IMPUTER, _SCALER, _MODELS, _CALIB, _HNSW_INDEX + _DB_ROWS = load_kfda_excels(FOOD_FILES, sheet_name=None) feats = [to_feat(row) for row in _DB_ROWS] # dict -> numpy 변환을 미리해두기 _DB_FEATS = np.vstack(feats).astype(np.float32) # float32로 메모리 최적화 + print("1차 진입") # 이름 벡터 관련 _NAME_VEC = joblib.load(f"{MODEL_DIR}/name_vectorizer.joblib") _NAME_MAT = joblib.load(f"{MODEL_DIR}/name_matrix.joblib") _NAME_LIST = joblib.load(f"{MODEL_DIR}/name_list.joblib") - _NAME_LOOKUP = {name: vec for name, vec in zip(_NAME_LIST, _NAME_MAT)} + # sparse -> dense float32 변환 + svd = TruncatedSVD(n_components=256, random_state=42) + _NAME_MAT_DENSE = svd.fit_transform(_NAME_MAT).astype(np.float32) + + # dens 기반 이름 벡터 캐시 + _NAME_LOOKUP = {name: vec for name, vec in zip(_NAME_LIST, _NAME_MAT_DENSE)} + + print(f"[DEBUG] load_all: _HNSW_INDEX before = {id(_HNSW_INDEX)}") + # HNSW 인덱스 구축 - dim = _NAME_MAT.shape[1] + dim = _NAME_MAT_DENSE.shape[1] _HNSW_INDEX = hnswlib.Index(space='cosine', dim=dim) _HNSW_INDEX.init_index(max_elements=_NAME_MAT.shape[0], ef_construction=200, M=16) - _HNSW_INDEX.add_items(_NAME_MAT.astype(np.float32)) + _HNSW_INDEX.add_items(_NAME_MAT_DENSE) _HNSW_INDEX.set_ef(50) + print(f"[DEBUG] load_all: _HNSW_INDEX after = {id(_HNSW_INDEX)}") + # 영양성분 전처리기 _IMPUTER = joblib.load(f"{MODEL_DIR}/nutrition_imputer.joblib") _SCALER = joblib.load(f"{MODEL_DIR}/nutrition_scaler.joblib") # ML 모델 _MODELS = {c: joblib.load(f"{MODEL_DIR}/concept_model_{c}.joblib") for c in CONCEPTS} + print("마지막 진입") try: _CALIB = joblib.load(f"{MODEL_DIR}/calibration.joblib") # calibration 로딩 diff --git a/leftovers/domain/recommend/service/matcher.py b/leftovers/domain/recommend/service/matcher.py index 86ccdeb..6ab126a 100644 --- a/leftovers/domain/recommend/service/matcher.py +++ b/leftovers/domain/recommend/service/matcher.py @@ -3,15 +3,15 @@ # 메뉴 이름이 유사한 것 찾기 def match_top1(query: str): - if not loader._NAME_LIST or loader._HNSW_INDEX : # 로딩된 메뉴가 없을 경우, ANN 인덱스가 없을 경우 + if not loader._NAME_LIST or loader._HNSW_INDEX is None : # 로딩된 메뉴가 없을 경우, ANN 인덱스가 없을 경우 return (-1, "", 0.0) # 매칭 실패 - if query in loader._NAME_LOOKUP: # 입력이 DB에 있는 경우 transform 스킵 + if query in loader._NAME_LOOKUP: # 입력이 DB에 존재하면 변환 스킵하고 캐싱된 벡터 사용 query_vector = loader._NAME_LOOKUP[query].astype(np.float32).reshape(1, -1) else: - query_vector = loader._NAME_VEC.transform([str(query)]).astype(np.float32) + query_vector = loader._NAME_VEC.transform([str(query)]).toarray().astype(np.float32) - labels, distances = loader._HNSW_INDEX.knn_query(query_vector, k=1) + labels, distances = loader._HNSW_INDEX.knn_query(query_vector, k=1) # ANN(HNSW) 검색 idx = int(labels[0][0]) similarity = 1 - float(distances[0][0]) # 가장 높은 유사도를 가진 인덱스 diff --git a/main.py b/main.py index 3cee34d..e2b106f 100644 --- a/main.py +++ b/main.py @@ -15,6 +15,7 @@ @asynccontextmanager async def lifespan(app: FastAPI): + print("loop 진입") loop = asyncio.get_event_loop() await loop.run_in_executor(None, loader.load_all) print("모델/DB 로딩 완료")