From cda214cf253426788e7285f7e01abfd0a36986e2 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Tue, 13 May 2025 12:31:34 +0900 Subject: [PATCH 01/28] delete api_request delete api_request --- api_request/reliefweb.py | 56 ---------------------------------------- 1 file changed, 56 deletions(-) delete mode 100644 api_request/reliefweb.py diff --git a/api_request/reliefweb.py b/api_request/reliefweb.py deleted file mode 100644 index 1bcbd7f..0000000 --- a/api_request/reliefweb.py +++ /dev/null @@ -1,56 +0,0 @@ -import requests -import json - -# API 엔드포인트 -api_url = "https://api.reliefweb.int/v1/jobs?limit=10&offset=1120" -# API링크를 저장하기 위한 배열 -description_endpoint=[] -# API요청을 보내 -while api_url: # 다음으로 참고할 데이터가 없을 경우 조건문 종료 - response = requests.get(api_url) - if response.status_code==200: - data=response.json() - - links = data.get('links', {}) - if links: - next_link = links.get('next', None) - api_url = next_link.get('href', None) if next_link else None - else: - api_url = None # 'next'가 없으면 종료 조건으로 설정 - - - print(api_url) # 디버깅용 - jobs=data.get("data", []) - description_endpoint.append([job['href'] for job in jobs]) - - # for job in jobs: - # href = job.get("href", "No Link") - - # description_endpoint.append({"href": href}) - - - else: - print("API 요청 실패:", response.status_code, response.text) - break - -# print(description_endpoint) - -job_list=[] -flattened_data = [item for sublist in description_endpoint for item in sublist] - -# print(flattened_data) - -for info in flattened_data: - - response=requests.get(info) - - if response.status_code==200: - data=response.json() - jobs=data.get("data", []) - for job in jobs: - fields=job.get("fields", {}) - title=fields.get("title", "No title") - body=fields.get("body", "No body") - job_list.append({"title": title, "body": body}) - -print(json.dumps(job_list, indent=4, ensure_ascii=False)) \ No newline at end of file
From c13d73d33a44f3687ba47b98e480dd7420be364c Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Tue, 13 May 2025 20:23:35 +0900 Subject: [PATCH 03/28] add docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit docstring 추가 및 가장 높은 키워드 반환하도록 수정+O클래스 제거 --- custom_keyword/ext.py | 10 +++++++++- ocr/__init__.py | 9 ++++----- ocr/o.py | 19 +++++++++++++++++-- summarization/sum_translate.py | 7 +++++-- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/custom_keyword/ext.py b/custom_keyword/ext.py index aa88bcd..5dedb70 100644 --- a/custom_keyword/ext.py +++ b/custom_keyword/ext.py @@ -16,10 +16,18 @@ def calculate_cosine_similarity(vec1, vec2): return cosine_similarity(vec1, vec2) def extract_keywords(question: str): + """ + 텍스트로부터 키워드를 추출하는 함수 + + Args: + question (str): 키워드를 추출하고자 하는 문자열 + Returns: + str: 가장 유사도가 높은 키워드 + """ sentence_embedding = get_embeddings(question) domain_embeddings = [get_embeddings(keyword) for keyword in domain_keywords] similarities = [ (keyword, calculate_cosine_similarity(sentence_embedding, embedding)[0][0]) for keyword, embedding in zip(domain_keywords, domain_embeddings) ] - return sorted(similarities, key=lambda x: x[1], reverse=True) \ No newline at end of file + return max(similarities, key=lambda x: x[1])[0] diff --git a/ocr/__init__.py b/ocr/__init__.py index 588b198..d02d56a 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -1,7 +1,6 @@ -from flask import Blueprint, request, jsonify -from utils import confirm_no_data +from flask import Blueprint, jsonify from server.logger import logger -from .o import O +from .o import extract_text, compare_texts ocr_bp = Blueprint('ocr', __name__, url_prefix='/ocr') @@ -13,10 +12,10 @@ def evaluate_image(review_id): compare_text = "" # OCR 실행 - extracted_text = O.extract_text(img_path) + extracted_text = extract_text(img_path) # 비교 실행 - result = O.compare_texts(extracted_text, compare_text) + result = compare_texts(extracted_text, compare_text) try: return jsonify({"llm_validation": result, diff --git a/ocr/o.py b/ocr/o.py index a505897..526293b 100644 --- a/ocr/o.py +++ b/ocr/o.py @@ -8,12 +8,27 @@ ocr = PaddleOCR(lang="korean") def extract_text(img_path): - """ 이미지에서 텍스트 추출 """ + """ + 이미지에서 텍스트 추출 + + Args: + img_path (str): 이미지의 경로(url) + Returns: + str : ocr이미지에서 추출한 문자열 반환 + """ results = ocr.ocr(img_path, cls=True) return " ".join(text for result in results for _, (text, _) in result) def compare_texts(text1, text2): - """ 두 텍스트 간의 관계 분석 """ + """ + ocr로 추출한 텍스트와 활동 제목 간의 관계 분석 + + Args: + text1 (str): 이미지에서 추출한 문자열 + text2 (str): 활동 제목에서의 문자열 + Returns: + str: 관련이 있다 판단 시 True / 없다 판단 시 False를 반환 + """ prompt = f""" Analyze the relationship between the following two texts. Determine whether they are conceptually or contextually related. If they are related, return True; otherwise, return False without additional explanation diff --git a/summarization/sum_translate.py b/summarization/sum_translate.py index d9900bc..5c79249 100644 --- a/summarization/sum_translate.py +++ b/summarization/sum_translate.py @@ -5,8 +5,11 @@ def summarize_translate_en_to_ko(text: str) -> str: """ 영어 텍스트를 한국어로 번역하고 요약.
- :param text: 번역 및 요약할 영어 문장 - :return: 요약된 한국어 번역 결과 + + Args: + text (str): 번역하고자 하는 원문(영어) 텍스트 + Returns: + str: 요약된 한국어 번역 결과 """ prompt = f""" Translate and summarize the following English text **into Korean** in **one or two sentences only**. From 98a25f7ef7b8d304f6db6c9a43e60ac691323c28 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Wed, 14 May 2025 12:17:54 +0900 Subject: [PATCH 04/28] add databaseconnection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. ocr에서 데이터베이스와 연결해 데이터를 가져오도록 코드 작성 2. api키를 env파일에서 가져오도록 변경 --- ocr/__init__.py | 11 ++++++++--- ocr/o.py | 7 ++++++- summarization/sum_translate.py | 9 ++++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index d02d56a..99c3114 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -1,4 +1,5 @@ from flask import Blueprint, jsonify +from server.db import run_query from server.logger import logger from .o import extract_text, compare_texts @@ -7,9 +8,13 @@ @ocr_bp.route('/', methods=['GET']) def evaluate_image(review_id): - # 데이터베이스에서 review_id를를 토대로 데이터를 가져옴(미구현) - img_path="" - compare_text = "" + img_query="""SELECT ri.image_urls + FROM reviews r + JOIN review_image_urls ri ON r.review_id = ri.review_id + WHERE r.review_id = '%s';""" + img_path = run_query(img_query, (review_id,)) + compare_query="SELECT activity_name FROM reviews WHERE review_id='%s';" + compare_text = run_query(compare_query, (review_id,)) # OCR 실행 extracted_text = extract_text(img_path) diff --git a/ocr/o.py b/ocr/o.py index 526293b..97e7894 100644 --- a/ocr/o.py +++ b/ocr/o.py @@ -1,9 +1,14 @@ import torch +import os +from dotenv import load_dotenv from paddleocr import PaddleOCR from openai import OpenAI +# .env파일 로드 +load_dotenv() -client = OpenAI(api_key="") # 나중에 api키 교체 +# 환경 변수에서 API 키 가져오기 +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) model = "gpt-4" ocr = PaddleOCR(lang="korean") diff --git a/summarization/sum_translate.py b/summarization/sum_translate.py index 5c79249..54efe88 100644 --- a/summarization/sum_translate.py +++ b/summarization/sum_translate.py @@ -1,5 +1,12 @@ from openai import OpenAI -client = OpenAI(api_key="") # 나중에 api_key 교체 +import os +from dotenv import load_dotenv + +# .env파일 로드 +load_dotenv() + +# 환경 변수에서 API 키 가져오기 +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) model = "gpt-4" def summarize_translate_en_to_ko(text: str) -> str: From 2582e7f4cd46469b3a645296e5b334366ade5efd Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Thu, 15 May 2025 11:43:28 +0900 Subject: [PATCH 05/28] add swagger MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 스웨거 라이브러리 추가 2. ocr 이미지분석 api에 대한 docstring추가 3. 
ocr이미지분석 중 True/False이외의 값으로 반환할 경우 False로 반환하도록 설정 --- ocr/__init__.py | 32 +++++++++++++++++++++++++++++++- requirements.in | 1 + requirements.txt | 33 ++++++++++++++++++++++++++++----- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index 99c3114..93d8a9d 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -7,7 +7,29 @@ @ocr_bp.route('/', methods=['GET']) def evaluate_image(review_id): - + """ + 이미지 평가 API + --- + parameters: + - name: review_id + in: path + type: string + required: true + description: 리뷰 ID + responses: + 200: + description: 성공적으로 평가됨 + schema: + type: object + properties: + llm_validation: + type: boolean + review_id: + type: string + 500: + description: 서버 오류 발생 + + """ img_query="""SELECT ri.image_urls FROM reviews r JOIN review_image_urls ri ON r.review_id = ri.review_id @@ -22,6 +44,14 @@ def evaluate_image(review_id): # 비교 실행 result = compare_texts(extracted_text, compare_text) + # 문자열 "True" 또는 "False"를 실제 Boolean 값으로 변환 + if result == "True": + result = True + elif result == "False": + result = False + else: + result = False # 예상치 못한 값이면 False로 처리 + try: return jsonify({"llm_validation": result, "review_id": review_id}), 200 diff --git a/requirements.in b/requirements.in index 523814a..3cca7c0 100644 --- a/requirements.in +++ b/requirements.in @@ -5,6 +5,7 @@ flask flask-cors requests python-dotenv +flasgger # Bert 임베딩 기반 유사도 추정 torch diff --git a/requirements.txt b/requirements.txt index 335edfc..b5cb415 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile +# pip-compile requirements.in # albucore==0.0.24 # via @@ -16,6 +16,10 @@ anyio==4.9.0 # via # httpx # openai +attrs==25.3.0 + # via + # jsonschema + # referencing beautifulsoup4==4.13.4 # via paddleocr blinker==1.9.0 @@ -44,9 +48,12 @@ filelock==3.18.0 # transformers fire==0.7.0 # via paddleocr +flasgger==0.9.7.1 + # via -r requirements.in flask==3.1.0 # via # -r requirements.in + # flasgger # flask-cors flask-cors==5.0.1 # via -r requirements.in @@ -83,6 +90,10 @@ jiter==0.9.0 # via openai joblib==1.5.0 # via scikit-learn +jsonschema==4.23.0 + # via flasgger +jsonschema-specifications==2025.4.1 + # via jsonschema lazy-loader==0.4 # via scikit-image lmdb==1.6.2 @@ -93,6 +104,8 @@ markupsafe==3.0.2 # via # jinja2 # werkzeug +mistune==3.1.3 + # via flasgger mpmath==1.3.0 # via sympy mysql-connector-python==9.3.0 @@ -128,6 +141,7 @@ opencv-python-headless==4.11.0.86 # albumentations packaging==25.0 # via + # flasgger # huggingface-hub # lazy-loader # scikit-image @@ -154,11 +168,16 @@ python-dotenv==1.1.0 pyyaml==6.0.2 # via # albumentations + # flasgger # huggingface-hub # paddleocr # transformers rapidfuzz==3.13.0 # via paddleocr +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications regex==2024.11.6 # via transformers requests==2.32.3 @@ -167,6 +186,10 @@ requests==2.32.3 # huggingface-hub # paddleocr # transformers +rpds-py==0.24.0 + # via + # jsonschema + # referencing safetensors==0.5.3 # via transformers scikit-image==0.25.2 @@ -182,6 +205,8 @@ shapely==2.1.0 # via paddleocr simsimd==6.2.1 # via albucore +six==1.17.0 + # via flasgger sniffio==1.3.1 # via # anyio @@ -219,6 +244,7 @@ typing-extensions==4.13.2 # pydantic # pydantic-core # python-docx + # referencing # torch # typing-inspection typing-inspection==0.4.0 @@ -229,6 +255,3 @@ werkzeug==3.1.3 
# via # flask # flask-cors - -# The following packages are considered to be unsafe in a requirements file: -# setuptools From f15330ff89ab2e6527f59d9db02c40a8d25e6069 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Fri, 16 May 2025 10:50:14 +0900 Subject: [PATCH 06/28] add paddlepaddle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. paddlepaddle을 requirements.in에 추가 2.flasgger을 적용하기 위한 코드 추가 --- app.py | 3 +++ ocr/__init__.py | 4 ++-- requirements.in | 1 + requirements.txt | 19 ++++++++++++++++++- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index b7ca66e..e1ba0c4 100644 --- a/app.py +++ b/app.py @@ -3,6 +3,7 @@ from flask import Flask from flask_cors import CORS from dotenv import load_dotenv +from flasgger import Swagger from server.logger import logger @@ -23,6 +24,8 @@ app = Flask(__name__) CORS(app, resources={r"/*": {"origins": "*"}}) +swagger=Swagger(app) + # 모든 Blueprint 등록 from chat import chat_bp app.register_blueprint(chat_bp) diff --git a/ocr/__init__.py b/ocr/__init__.py index 93d8a9d..e37074d 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -13,7 +13,7 @@ def evaluate_image(review_id): parameters: - name: review_id in: path - type: string + type: "string" required: true description: 리뷰 ID responses: @@ -25,7 +25,7 @@ def evaluate_image(review_id): llm_validation: type: boolean review_id: - type: string + type: "string" 500: description: 서버 오류 발생 diff --git a/requirements.in b/requirements.in index 3cca7c0..9c29df6 100644 --- a/requirements.in +++ b/requirements.in @@ -17,6 +17,7 @@ openai # OCR paddleocr +paddlepaddle # MySQL mysql-connector-python diff --git a/requirements.txt b/requirements.txt index b5cb415..26bc0d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,8 @@ anyio==4.9.0 # via # httpx # openai +astor==0.8.1 + # via paddlepaddle attrs==25.3.0 # via # jsonschema @@ -39,6 +41,8 @@ colorama==0.4.6 # tqdm cython==3.1.0 # via paddleocr +decorator==5.2.1 + # via paddlepaddle distro==1.9.0 # via openai filelock==3.18.0 @@ -68,7 +72,9 @@ h11==0.16.0 httpcore==1.0.9 # via httpx httpx==0.28.1 - # via openai + # via + # openai + # paddlepaddle huggingface-hub==0.31.1 # via # tokenizers @@ -112,6 +118,7 @@ mysql-connector-python==9.3.0 # via -r requirements.in networkx==3.4.2 # via + # paddlepaddle # scikit-image # torch numpy==2.2.5 @@ -122,7 +129,9 @@ numpy==2.2.5 # opencv-contrib-python # opencv-python # opencv-python-headless + # opt-einsum # paddleocr + # paddlepaddle # scikit-image # scikit-learn # scipy @@ -139,6 +148,8 @@ opencv-python-headless==4.11.0.86 # via # albucore # albumentations +opt-einsum==3.3.0 + # via paddlepaddle packaging==25.0 # via # flasgger @@ -148,11 +159,16 @@ packaging==25.0 # transformers paddleocr==2.10.0 # via -r requirements.in +paddlepaddle==3.0.0 + # via -r requirements.in pillow==11.2.1 # via # imageio # paddleocr + # paddlepaddle # scikit-image +protobuf==6.31.0 + # via paddlepaddle pyclipper==1.3.0.post6 # via paddleocr pydantic==2.11.4 @@ -241,6 +257,7 @@ typing-extensions==4.13.2 # beautifulsoup4 # huggingface-hub # openai + # paddlepaddle # pydantic # pydantic-core # python-docx From e0fb1278251495f32815a1ad3e11ad639833c67c Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Sat, 24 May 2025 18:42:41 +0900 Subject: [PATCH 07/28] include imagestream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.api반환에서 ocr결과를 인증기록과 수상기록ocr결과 두 개로 나누어 전송하도록 바꾸었습니다. 이를 반영하여 스웨거 또한 변경하였습니다. 
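(예시 응답 형태(가정): {"ocr_result": "True", "award_ocr_result": "None", "review_id": "<리뷰 ID>"}; 수상기록 이미지가 없으면 award_ocr_result는 "None"으로 내려갑니다.)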
2.s3에서 이미지를 가져오기 위해 boto3패키지를 추가하였습니다 3.이미지 스트림을 통해 이미지를 로컬에 저장하도록 하였습니다. --- ocr/__init__.py | 57 +++++++++++++++++++++++++--------------- ocr/o.py | 68 +++++++++++++++++++++++++++++++++++++++++++----- requirements.in | 2 ++ requirements.txt | 22 ++++++++++++++-- 4 files changed, 120 insertions(+), 29 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index e37074d..6da36ef 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -1,7 +1,7 @@ from flask import Blueprint, jsonify from server.db import run_query from server.logger import logger -from .o import extract_text, compare_texts +from .o import download_image, extract_text, compare_texts ocr_bp = Blueprint('ocr', __name__, url_prefix='/ocr') @@ -22,38 +22,53 @@ def evaluate_image(review_id): schema: type: object properties: - llm_validation: - type: boolean + ocr_result: + type: "string" + enum: ["True", "False"] + award_ocr_result: + type: "string" + enum: ["True", "False", "None"] review_id: type: "string" 500: description: 서버 오류 발생 """ - img_query="""SELECT ri.image_urls - FROM reviews r - JOIN review_image_urls ri ON r.review_id = ri.review_id - WHERE r.review_id = '%s';""" - img_path = run_query(img_query, (review_id,)) - compare_query="SELECT activity_name FROM reviews WHERE review_id='%s';" + # review_img_query="""SELECT ri.image_urls + # FROM reviews r + # JOIN review_image_urls ri ON r.review_id = ri.review_id + # WHERE r.review_id = %s;""" + review_img_query="""SELECT image_urls + FROM review_image_urls + WHERE HEX(review_id)=%s""" + review_img_path = run_query(review_img_query, (review_id,)) + + award_img_query = "SELECT award_image_url FROM reviews WHERE hex(review_id) = %s;" + award_img_path=run_query(award_img_query, (review_id, )) + + compare_query="SELECT activity_name FROM reviews WHERE hex(review_id)=%s" compare_text = run_query(compare_query, (review_id,)) # OCR 실행 - extracted_text = extract_text(img_path) - - # 비교 실행 - result = compare_texts(extracted_text, compare_text) - - # 문자열 "True" 또는 "False"를 실제 Boolean 값으로 변환 - if result == "True": - result = True - elif result == "False": - result = False + if review_img_path: + # ocr결과의 기본값은 False + ocr_result = "False" + for img_url in review_img_path: + image_stream = download_image(img_url) + extracted_text = extract_text(image_stream) + ocr_result = compare_texts(extracted_text, compare_text[0]) + if ocr_result == "True": + break + if award_img_path[0][0]: + award_image_stream = download_image(award_img_path[0]) + award_text = extract_text(award_image_stream) + award_ocr_result = compare_texts(award_text, compare_text[0]) else: - result = False # 예상치 못한 값이면 False로 처리 + award_ocr_result = "None" try: - return jsonify({"llm_validation": result, + return jsonify({"ocr_result": ocr_result, + "award_ocr_result": award_ocr_result, "review_id": review_id}), 200 except Exception as e: logger.error(e) diff --git a/ocr/o.py b/ocr/o.py index 97e7894..f7271fc 100644 --- a/ocr/o.py +++ b/ocr/o.py @@ -1,8 +1,12 @@ import torch import os +import boto3 +import numpy as np +import cv2 from dotenv import load_dotenv from paddleocr import PaddleOCR from openai import OpenAI +from io import BytesIO # .env파일 로드 load_dotenv() @@ -12,18 +16,70 @@ model = "gpt-4" ocr = PaddleOCR(lang="korean") -def extract_text(img_path): - """ - 이미지에서 텍스트 추출 +def download_image(img_path): + """ + s3에서 이미지 다운로드 후 벡터db나 tmp폴더에 저장 + + Args: + img_path (str): s3상에 이미지 경로 + Returns: + BytesIO: 이미지 데이터의 바이트스트림 객체 + """ + s3 = boto3.client( + 's3', + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + 
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + region_name=os.getenv("AWS_REGION") # 원하는 리전 + ) + bucket_name='trendist' + file_name = img_path[0] + # file_name = "award/0EEC67FEECF943B980D60BF3430FB213" + image_stream = BytesIO() + print(file_name) + s3.download_fileobj(bucket_name, file_name, image_stream) + image_stream.seek(0) + + return image_stream + + +# def extract_text(img_path): +# """ +# 이미지에서 텍스트 추출 + +# Args: +# img_path (str): 이미지의 로컬경로(url) +# Returns: +# str : ocr이미지에서 추출한 문자열 반환 +# """ +# results = ocr.ocr(img_path, cls=True) +# return " ".join(text for result in results for _, (text, _) in result) + +def extract_text(image_stream): + """ + BytesIO 객체의 이미지를 대상으로 OCR 수행 Args: - img_path (str): 이미지의 경로(url) + image_stream (BytesIO): 메모리에 저장된 이미지 데이터 Returns: - str : ocr이미지에서 추출한 문자열 반환 + list: OCR 결과 """ - results = ocr.ocr(img_path, cls=True) + # 스트림을 numpy 배열로 변환 + image_stream.seek(0) # 읽기 위치 초기화 + file_bytes = np.frombuffer(image_stream.getvalue(), dtype=np.uint8) + img = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR) # OpenCV를 사용하여 이미지 디코딩 + print("ocr수행전전") + # OCR 수행 + ocr = PaddleOCR(lang='korean') # 언어 설정 가능 + results = ocr.ocr(img, cls=True) + print("ocr수행후") return " ".join(text for result in results for _, (text, _) in result) +# 예제 사용법 +# with open("sample.jpg", "rb") as f: +# image_stream = BytesIO(f.read()) +# ocr_result = perform_ocr(image_stream) +# print(ocr_result) + def compare_texts(text1, text2): """ ocr로 추출한 텍스트와 활동 제목 간의 관계 분석 diff --git a/requirements.in b/requirements.in index 9c29df6..f4c73c7 100644 --- a/requirements.in +++ b/requirements.in @@ -22,6 +22,8 @@ paddlepaddle # MySQL mysql-connector-python +#boto3 +boto3 # 새로운 패키지 # package-name diff --git a/requirements.txt b/requirements.txt index 26bc0d3..ee748ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,6 +26,12 @@ beautifulsoup4==4.13.4 # via paddleocr blinker==1.9.0 # via flask +boto3==1.38.23 + # via -r requirements.in +botocore==1.38.23 + # via + # boto3 + # s3transfer certifi==2025.1.31 # via # httpcore @@ -94,6 +100,10 @@ jinja2==3.1.6 # torch jiter==0.9.0 # via openai +jmespath==1.0.1 + # via + # boto3 + # botocore joblib==1.5.0 # via scikit-learn jsonschema==4.23.0 @@ -177,6 +187,8 @@ pydantic==2.11.4 # openai pydantic-core==2.33.2 # via pydantic +python-dateutil==2.9.0.post0 + # via botocore python-docx==1.1.2 # via paddleocr python-dotenv==1.1.0 @@ -206,6 +218,8 @@ rpds-py==0.24.0 # via # jsonschema # referencing +s3transfer==0.13.0 + # via boto3 safetensors==0.5.3 # via transformers scikit-image==0.25.2 @@ -222,7 +236,9 @@ shapely==2.1.0 simsimd==6.2.1 # via albucore six==1.17.0 - # via flasgger + # via + # flasgger + # python-dateutil sniffio==1.3.1 # via # anyio @@ -267,7 +283,9 @@ typing-extensions==4.13.2 typing-inspection==0.4.0 # via pydantic urllib3==2.3.0 - # via requests + # via + # botocore + # requests werkzeug==3.1.3 # via # flask From 1183181da491e956bc7c17148a64b9682c6bf88b Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 26 May 2025 19:32:03 +0900 Subject: [PATCH 08/28] =?UTF-8?q?ocr=20api=EC=A1=B0=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 백엔드측의 요청으로 api를 변경하였습니다 1.get방식->post 데이터베이스에 직접 접근 대신 body부분에 url을 전달하는 방식으로 변경 2.return값 변수명 변경 및 review_id제거 3.변경에 따른 변수 및 스웨거 조정 --- ocr/__init__.py | 116 ++++++++++++++++++++++++++++-------------------- ocr/o.py | 26 +---------- 2 files changed, 70 insertions(+), 72 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index 
6da36ef..7d5dd84 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -1,54 +1,75 @@ -from flask import Blueprint, jsonify +from flask import Blueprint, jsonify, request +from flasgger import Swagger, swag_from from server.db import run_query from server.logger import logger from .o import download_image, extract_text, compare_texts ocr_bp = Blueprint('ocr', __name__, url_prefix='/ocr') -@ocr_bp.route('/', methods=['GET']) -def evaluate_image(review_id): - """ - 이미지 평가 API - --- - parameters: - - name: review_id - in: path - type: "string" - required: true - description: 리뷰 ID - responses: - 200: - description: 성공적으로 평가됨 - schema: - type: object - properties: - ocr_result: - type: "string" - enum: ["True", "False"] - award_ocr_result: - type: "string" - enum: ["True", "False", "None"] - review_id: - type: "string" - 500: - description: 서버 오류 발생 +@ocr_bp.route('/', methods=['POST']) +@swag_from({ + 'summary': 'OCR 이미지 비교 API', + 'description': '이미지에서 텍스트를 추출하고 비교하는 API', + 'parameters': [ + { + 'name': 'body', + 'in': 'body', + 'required': True, + 'schema': { + 'type': 'object', + 'properties': { + 'image_urls': { + 'type': 'array', + 'items': {'type': 'string'}, + 'description': '검토할 이미지 URL 리스트' + }, + 'award_img_urls': { + 'type': 'string', + 'description': '수상 이미지의 URL' + }, + 'title': { + 'type': 'string', + 'description': '비교할 기준 텍스트' + } + } + } + } + ], + 'responses': { + 200: { + 'description': 'OCR 결과 반환', + 'schema': { + 'type': 'object', + 'properties': { + 'ocrResult': {'type': 'string', 'description': 'OCR 비교 결과'}, + 'awardOcrResult': {'type': 'string', 'description': '수상 이미지 OCR 비교 결과'} + } + } + }, + 500: { + 'description': '서버 에러 발생', + 'schema': { + 'type': 'object', + 'properties': { + 'answer': {'type': 'string', 'description': '에러 메시지'} + } + } + } + } +}) - """ - # review_img_query="""SELECT ri.image_urls - # FROM reviews r - # JOIN review_image_urls ri ON r.review_id = ri.review_id - # WHERE r.review_id = %s;""" - review_img_query="""SELECT image_urls - FROM review_image_urls - WHERE HEX(review_id)=%s""" - review_img_path = run_query(review_img_query, (review_id,)) +def evaluate_image(): - award_img_query = "SELECT award_image_url FROM reviews WHERE hex(review_id) = %s;" - award_img_path=run_query(award_img_query, (review_id, )) - compare_query="SELECT activity_name FROM reviews WHERE hex(review_id)=%s" - compare_text = run_query(compare_query, (review_id,)) + data=request.get_json() + review_img_path=data.get("image_urls") + award_img_path=data.get("award_img_urls") + compare_text=data.get("title") + + print(review_img_path) + print(award_img_path) + print(compare_text) # OCR 실행 if review_img_path: # ocr결과의 기본값은 False @@ -56,20 +77,19 @@ def evaluate_image(review_id): for img_url in review_img_path: image_stream = download_image(img_url) extracted_text = extract_text(image_stream) - ocr_result = compare_texts(extracted_text, compare_text[0]) + ocr_result = compare_texts(extracted_text, compare_text) if ocr_result == "True": break - if award_img_path[0][0]: - award_image_stream = download_image(award_img_path[0]) + if award_img_path != None: + award_image_stream = download_image(award_img_path) award_text = extract_text(award_image_stream) - award_ocr_result = compare_texts(award_text, compare_text[0]) + award_ocr_result = compare_texts(award_text, compare_text) else: award_ocr_result = "None" try: - return jsonify({"ocr_result": ocr_result, - "award_ocr_result": award_ocr_result, - "review_id": review_id}), 200 + return jsonify({"ocrResult": ocr_result, + 
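# compare_texts가 반환한 "True"/"False" 문자열을 그대로 전달(백엔드 요청에 따른 형식) +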
"awardOcrResult": award_ocr_result}), 200 except Exception as e: logger.error(e) return jsonify({"answer": f"죄송합니다. 에러가 발생했습니다."}), 500 \ No newline at end of file diff --git a/ocr/o.py b/ocr/o.py index f7271fc..fecffc0 100644 --- a/ocr/o.py +++ b/ocr/o.py @@ -18,7 +18,7 @@ def download_image(img_path): """ - s3에서 이미지 다운로드 후 벡터db나 tmp폴더에 저장 + s3에서 이미지 다운로드 후 저장 Args: img_path (str): s3상에 이미지 경로 @@ -32,28 +32,14 @@ def download_image(img_path): region_name=os.getenv("AWS_REGION") # 원하는 리전 ) bucket_name='trendist' - file_name = img_path[0] - # file_name = "award/0EEC67FEECF943B980D60BF3430FB213" + file_name = img_path image_stream = BytesIO() - print(file_name) s3.download_fileobj(bucket_name, file_name, image_stream) image_stream.seek(0) return image_stream -# def extract_text(img_path): -# """ -# 이미지에서 텍스트 추출 - -# Args: -# img_path (str): 이미지의 로컬경로(url) -# Returns: -# str : ocr이미지에서 추출한 문자열 반환 -# """ -# results = ocr.ocr(img_path, cls=True) -# return " ".join(text for result in results for _, (text, _) in result) - def extract_text(image_stream): """ BytesIO 객체의 이미지를 대상으로 OCR 수행 @@ -67,19 +53,11 @@ def extract_text(image_stream): image_stream.seek(0) # 읽기 위치 초기화 file_bytes = np.frombuffer(image_stream.getvalue(), dtype=np.uint8) img = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR) # OpenCV를 사용하여 이미지 디코딩 - print("ocr수행전전") # OCR 수행 ocr = PaddleOCR(lang='korean') # 언어 설정 가능 results = ocr.ocr(img, cls=True) - print("ocr수행후") return " ".join(text for result in results for _, (text, _) in result) -# 예제 사용법 -# with open("sample.jpg", "rb") as f: -# image_stream = BytesIO(f.read()) -# ocr_result = perform_ocr(image_stream) -# print(ocr_result) - def compare_texts(text1, text2): """ ocr로 추출한 텍스트와 활동 제목 간의 관계 분석 From 075df9d8b86d23b74ec9fda3cba96d6a947e61b1 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Thu, 29 May 2025 12:00:13 +0900 Subject: [PATCH 09/28] =?UTF-8?q?=EC=9D=B4=EB=AF=B8=EC=A7=80=20=EB=8B=A4?= =?UTF-8?q?=EC=9A=B4=EB=A1=9C=EB=93=9C=20=EB=B0=A9=EC=8B=9D=20=EB=B3=80?= =?UTF-8?q?=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit s3에서 boto객체를 만드는 것 대신 presignedurl을 이용하도록 변경 --- ocr/__init__.py | 9 +++++---- ocr/o.py | 24 +++++++++++------------- requirements.in | 4 ++-- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index 7d5dd84..6426c86 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -1,6 +1,6 @@ from flask import Blueprint, jsonify, request from flasgger import Swagger, swag_from -from server.db import run_query + from server.logger import logger from .o import download_image, extract_text, compare_texts @@ -63,8 +63,8 @@ def evaluate_image(): data=request.get_json() - review_img_path=data.get("image_urls") - award_img_path=data.get("award_img_urls") + review_img_path=data.get("imageUrls") + award_img_path=data.get("awardImgUrl") compare_text=data.get("title") print(review_img_path) @@ -77,6 +77,7 @@ def evaluate_image(): for img_url in review_img_path: image_stream = download_image(img_url) extracted_text = extract_text(image_stream) + print(extracted_text) ocr_result = compare_texts(extracted_text, compare_text) if ocr_result == "True": break @@ -85,7 +86,7 @@ def evaluate_image(): award_text = extract_text(award_image_stream) award_ocr_result = compare_texts(award_text, compare_text) else: - award_ocr_result = "None" + award_ocr_result = "False" try: return jsonify({"ocrResult": ocr_result, diff --git a/ocr/o.py b/ocr/o.py index fecffc0..ee89ee3 100644 --- a/ocr/o.py +++ 
b/ocr/o.py @@ -1,8 +1,8 @@ import torch import os -import boto3 import numpy as np import cv2 +import requests from dotenv import load_dotenv from paddleocr import PaddleOCR from openai import OpenAI @@ -18,24 +18,22 @@ def download_image(img_path): """ - s3에서 이미지 다운로드 후 저장 + s3에서 이미지 다운로드 후 바이트스트림에 저장 Args: img_path (str): s3상에 이미지 경로 Returns: BytesIO: 이미지 데이터의 바이트스트림 객체 """ - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), - region_name=os.getenv("AWS_REGION") # 원하는 리전 - ) - bucket_name='trendist' - file_name = img_path - image_stream = BytesIO() - s3.download_fileobj(bucket_name, file_name, image_stream) - image_stream.seek(0) + + presigned_url = img_path + print("presigned_url", presigned_url) + # 이미지 다운로드 (바이너리 형태) + response = requests.get(presigned_url) + + # 응답 확인(다운로드 실패 시 HTTPError 발생) 후 메모리에 저장 + response.raise_for_status() + image_stream = BytesIO(response.content) return image_stream diff --git a/requirements.in b/requirements.in index f4c73c7..d663ace 100644 --- a/requirements.in +++ b/requirements.in @@ -18,12 +18,12 @@ openai # OCR paddleocr paddlepaddle +# gpu 사용할 경우 해당 패키지 포함 +#paddlepaddle-gpu==2.5.0.post118 # MySQL mysql-connector-python -#boto3 -boto3 # 새로운 패키지 # package-name From c65ef565ea76fa9bea70879dea2c6d0ceb1bbdae Mon Sep 17 00:00:00 2001 From: urusekai Date: Fri, 30 May 2025 12:49:38 +0900 Subject: [PATCH 10/28] =?UTF-8?q?=ED=81=AC=EB=A1=A4=EB=9F=AC=EC=B6=94?= =?UTF-8?q?=EA=B0=80=20=EB=B0=8F=20db.py,=20ext.py,=20sum=5Ftranslate.py?= =?UTF-8?q?=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler/bbc_crawler.py | 128 ++++++++++++++++++++++ crawler/idealist_crawler.py | 146 +++++++++++++++++++++++++ crawler/keyword_extractor.py | 84 +++++++++++++++ crawler/main_crawler.py | 23 ++++ crawler/save_to_db.py | 84 +++++++++++++++ crawler/unv_crawler.py | 112 +++++++++++++++++++ crawler/v1365_crawler.py | 173 ++++++++++++++++++++++++++++++ crawler/wevity_crawler.py | 190 +++++++++++++++++++++++++++++++++ custom_keyword/ext.py | 12 ++- server/db.py | 25 ++++- summarization/sum_translate.py | 15 ++- 11 files changed, 974 insertions(+), 18 deletions(-) create mode 100644 crawler/bbc_crawler.py create mode 100644 crawler/idealist_crawler.py create mode 100644 crawler/keyword_extractor.py create mode 100644 crawler/main_crawler.py create mode 100644 crawler/save_to_db.py create mode 100644 crawler/unv_crawler.py create mode 100644 crawler/v1365_crawler.py create mode 100644 crawler/wevity_crawler.py diff --git a/crawler/bbc_crawler.py b/crawler/bbc_crawler.py new file mode 100644 index 0000000..f254e29 --- /dev/null +++ b/crawler/bbc_crawler.py @@ -0,0 +1,128 @@ +import requests +from crawler.keyword_extractor import extract_keyword +from summarization.sum_translate import translate_en_to_ko +from crawler.save_to_db import save_issues +from bs4 import BeautifulSoup +from datetime import datetime +from server.db import run_query + +BASE_URL = 'https://web-cdn.api.bbci.co.uk/xd/content-collection/' +COLLECTIONS = { + 'natural-wonders' : '9f0b9075-b620-4859-abdc-ed042dd9ee66', + 'weather-science' : '696fca43-ec53-418d-a42c-067cb0449ba9', + 'climate-solutions' : '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b', +} +HEADERS = { + 'User-Agent': 'Mozilla/5.0' +} +SIZE = 9 + +def get_last_issue_date(): + sql = """ + SELECT MAX(issue_date) + FROM issues; + """ + result = run_query(sql) + + if result and result[0][0]: + dt = result[0][0] +
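# run_query가 돌려준 MAX(issue_date)는 datetime 객체라는 가정 하에, is_end 비교에 쓰는 문자열 형식으로 변환 +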
latest_issue_date = dt.strftime("%Y-%m-%d %H:%M:%S.%f") + return latest_issue_date + else: + return None + +def is_end(date, end_time): + date_dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f") + end_time_dt = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f") + return date_dt <= end_time_dt + +def get_datetime(time): + dt = datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ") + return dt.strftime("%Y-%m-%d %H:%M:%S.%f") + +def get_content(url): + response = requests.get(url) + soup = BeautifulSoup(response.content, "html.parser") + content_divs = soup.find_all('div', attrs={'data-component': 'text-block'}) + contents = [div.get_text(strip=True) for div in content_divs] + full_content = '\n'.join(contents) if contents else "No Content" + + return full_content + +def get_articles(page, collection_id, end_time): + params = { + 'page': page, + 'size': SIZE, + } + + response = requests.get(BASE_URL + collection_id, params=params, headers=HEADERS) + + if not response: + return [] + + datas = response.json().get('data') + articles = [] + + for data in datas: + date = get_datetime(data['firstPublishedAt']) + + if end_time: + if is_end(date, end_time): + break + + title = translate_en_to_ko(data['title']) + keyword = extract_keyword(data['summary']) + summary = translate_en_to_ko(data['summary']) + url = "https://www.bbc.com" + data['path'] + image = data['indexImage']['model']['blocks']['src'] or None + + articles.append( + { + 'content': summary, + 'image_url': image, + 'issue_date': date, + 'keyword': keyword, + 'site_url': url, + 'title': title, + } + ) + print(f"[BBC] 크롤링 완료 : {title}") + + return articles + +def crawl(): + print("[BBC] 크롤링 시작") + results = [] + last_issue_date = get_last_issue_date() + + if last_issue_date: + print(f"[BBC] DB의 마지막 이슈 이후 데이터만 크롤링 시작 (DATE : {last_issue_date})") + else: + print(f"[BBC] DB에 이슈 없음, 모든 데이터 크롤링 시작") + + for category, collection_id in COLLECTIONS.items(): + # print(f"[BBC] 카테고리 {category} :") + page = 0 + + while True: + articles = get_articles(page, collection_id, last_issue_date) + + if not articles: + break + + results.extend(articles) + page += 1 + + if results: + print(f"[BBC] 크롤링 완료 : {len(results)}개의 이슈를 크롤링했습니다.") + save_issues(results) + else: + print("[BBC] 크롤링 완료 : 새로운 이슈가 없습니다.") + + + +def main(): + crawl() + +if __name__ == '__main__': + main() diff --git a/crawler/idealist_crawler.py b/crawler/idealist_crawler.py new file mode 100644 index 0000000..7313333 --- /dev/null +++ b/crawler/idealist_crawler.py @@ -0,0 +1,146 @@ +import requests +import json +from datetime import datetime, timedelta, timezone +from crawler.keyword_extractor import extract_keyword +from crawler.save_to_db import save_activities +from server.db import run_query + +ENDPOINT = "https://nsv3auess7-dsn.algolia.net/1/indexes/*/queries" +HEADERS = { + "Content-Type": "application/json", + "x-algolia-agent": "Algolia for JavaScript (5.20.0); Search (5.20.0); Browser", + "x-algolia-api-key": "c2730ea10ab82787f2f3cc961e8c1e06", + "x-algolia-application-id": "NSV3AUESS7" +} +DEFAULT_IMAGE_URL = "https://www.idealist.org/assets/417d88fd628db1c1ac861f3ea8db58c1a159d52a/images/icons/action-opps/action-opps-volunteermatch.svg" + +def get_last_timestamp(): + sql = """ + SELECT start_date + FROM activities + WHERE activity_site = 'IDEALIST' + ORDER BY start_date DESC + LIMIT 1; + """ + last_timestamp = run_query(sql) + + if last_timestamp: + dt = last_timestamp[0][0].replace(tzinfo=timezone.utc) + return int(dt.timestamp()) + else: + return 0 + +def build_payload(page, 
type='volunteer', timestamp=0): + if type == 'volunteer': + filters = f"actionType:'VOLOP' AND published > {timestamp}" + index_name = "idealist7-production-action-opps" + else: + filters = f"type:'INTERNSHIP' AND published > {timestamp}" + index_name = "idealist7-production" + + return { + "requests": [ + { + "indexName": index_name, + "facets": ["*"], + "hitsPerPage": 100, + "attributesToSnippet": ["description:20"], + "attributesToRetrieve": ["*"], + "filters": filters, + "removeStopWords": True, + "ignorePlurals": True, + "advancedSyntax": True, + "queryLanguages": ["en"], + "page": page, + "query": "", + "getRankingInfo": True, + "clickAnalytics": True, + "analytics": True + } + ] + } + +def get_url(item): + url = item.get("url") + if isinstance(url, str): + return url + elif isinstance(url, dict): + return "https://www.idealist.org" + next(iter(url.values()), "") + return "" + +def get_image(item): + img = item.get("imageUrl") or DEFAULT_IMAGE_URL + return img + +def get_published(item): + timestamp = item.get("published") + return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f') + +def get_activities(page, timestamp, type): + payload = build_payload(page, type, timestamp) + response = requests.post(ENDPOINT, headers=HEADERS, json=payload) + + try: + data = response.json()["results"][0]["hits"] + except Exception as e: + print(f"[!] JSON 파싱 에러: {e}") + return None + + result = [] + + if data: + for item in data: + activity_type = "VOLUNTEER" if type=='volunteer' else 'INTERNSHIP' + activity_content = item.get("description") + activity_name = item.get("name") + activity_image_url = get_image(item) + activity_url = get_url(item) + start_date = get_published(item) + end_date = None + keyword = extract_keyword(activity_content) + + result.append( + { + "activity_site": "IDEALIST", + "activity_type": activity_type, + "activity_content": activity_content, + "end_date": end_date, + "activity_image_url": activity_image_url, + "keyword": keyword, + "activity_name": activity_name, + "site_url": activity_url, + "start_date": start_date + } + ) + print(f"[IDEALIST] 크롤링 완료 : {item.get('name', '')}") + return result + else: + return None + +def crawl(): + print("[IDEALIST] 크롤링 시작") + crawled_activities = [] + last_timestamp = get_last_timestamp() + + if last_timestamp > 0: + print(f"[IDEALIST] DB의 마지막 활동 이후 데이터만 크롤링 시작 (TIMESTAMP: {last_timestamp})") + else: + print(f"[IDEALIST] DB에 활동 없음, 모든 데이터 크롤링 시작") + + for type in ['volunteer', 'internship']: + page = 0 + while True: + activities = get_activities(page, last_timestamp, type) + if not activities: + break + crawled_activities.extend(activities) + page += 1 + + if crawled_activities: + print(f"[IDEALIST] 크롤링 완료 : {len(crawled_activities)}개의 활동을 크롤링했습니다.") + save_activities(crawled_activities) + else: + print("[IDEALIST] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == "__main__": + crawl() diff --git a/crawler/keyword_extractor.py b/crawler/keyword_extractor.py new file mode 100644 index 0000000..8d817a1 --- /dev/null +++ b/crawler/keyword_extractor.py @@ -0,0 +1,84 @@ +import requests +import os +from dotenv import load_dotenv + +# .env 파일에서 환경변수 로드 +load_dotenv() + +# 키워드 후보 +KEYWORDS = ['Economy','Environment','PeopleAndSociety','Technology'] +MODEL = 'gemini-2.0-flash-lite' + +def extract_keyword(text: str) -> str: + """ + 봉사활동 내용을 입력받아 적절한 키워드를 반환합니다.
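+ Gemini 호출이 실패하거나 응답이 후보 목록에 없으면 기본값으로 KEYWORDS[0]을 반환합니다.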
+ + Parameters: + text (str): 봉사활동 내용 + + Returns: + str: 봉사활동 내용에 맞는 키워드 + """ + + # Gemini API 키 가져오기 + api_key = os.getenv('GEMINI_API_KEY') + if not api_key: + raise ValueError("GEMINI_API_KEY 환경변수가 설정되지 않았습니다.") + + # API 엔드포인트 URL + url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={api_key}" + + # 프롬프트 작성 + prompt = f""" +Read the following volunteer activity description and choose the **most appropriate keyword** from the provided list. + +Only output **one keyword**, exactly as it appears in the list. Do not add any extra words or punctuation. + +Volunteer Description: +{text} + +Keyword List: +{', '.join(KEYWORDS)} +""" + + # API 요청 데이터 준비 + payload = { + "contents": [{ + "parts": [{ + "text": prompt + }] + }] + } + + # API 호출 + headers = {'Content-Type': 'application/json'} + try: + response = requests.post(url, headers=headers, json=payload) + response.raise_for_status() # HTTP 에러 체크 + + # 응답 파싱 + result = response.json() + if 'candidates' in result and len(result['candidates']) > 0: + generated_text = result['candidates'][0]['content']['parts'][0]['text'] + # 응답에서 키워드만 추출 (앞뒤 공백 제거) + keyword = generated_text.strip() + + # 추출된 키워드가 후보 목록에 있는지 확인 + if keyword in KEYWORDS: + return keyword + else: + return KEYWORDS[0] # 기본값으로 첫 번째 키워드 반환 + + except Exception as e: + print(f"API 호출 중 오류 발생: {e}") + return KEYWORDS[0] # 오류 발생 시 기본값으로 첫 번째 키워드 반환 + + return KEYWORDS[0] # 기본값으로 첫 번째 키워드 반환 + +if __name__ == "__main__": + # 테스트용 예시 + text = """ + 'Are you passionate about creating a positive change in society? Our CBS-featured non-profit wants you to join us in making a difference. About Us: Bright Mind is an award-winning non-profit organization recognized for our innovative initiatives such as Wellness Week and Street Care. Our outreach has reached up to 60 million people and has been featured on CBS, Politico, ABC, and Newsweek. We are looking for a passionate and versatile volunteer to join our team. If you have a desire to make a positive impact in the lives of those experiencing homelessness, we would love to hear from you! Position Overview ● Bright Mind is seeking dedicated and compassionate individuals to join our Street Care team as Homelessness Volunteers. ● In this role, you will have the opportunity to make a tangible difference in the lives of those experiencing homelessness. ● You will work closely with our Community Outreach team to provide support, resources, and advocacy for homeless individuals and families. ● We have decades of experience providing aid to homeless and highly at risk people, and our program always places safety first. ● We have a variety of openings, whether you’re interested in going out on the street or looking to help in other ways. Key Responsibilities ● Direct Support: ○ Engage with homeless individuals and families to assess their needs and provide appropriate support. ○ Distribute essential items such as food, clothing, hygiene products, and blankets. ● Resource Connection: ○ Connect individuals with local services, including housing, medical care, job training, and mental health support. ○ Provide information about available resources and help individuals navigate the social services system. ● Advocacy and Education: ○ Participate in community education programs to inform the public about homelessness issues and how they can help. ○ Work with local businesses and organizations to secure support (notably in-kind, such as food and clothing) and collaborate on our homeless initiatives. 
● Event Coordination: ○ Assist in organizing and executing events such as donation drives, community meals, and health fairs. ○ Support the planning and logistics of outreach activities and special programs. ● Data Collection and Reporting: ○ Maintain accurate records of interactions and services provided to homeless individuals. ○ Assist with data collection and reporting to help track the impact of Bright Mind’s homelessness programs. Qualifications ● Skills and Competencies: ○ Strong interpersonal and communication skills. ○ Empathy, patience, and a non-judgmental attitude towards individuals experiencing homelessness. ○ Ability to work independently and as part of a team. ○ Flexibility and adaptability in a dynamic work environment. ○ Basic knowledge of social services and resources available for homeless individuals (preferred but not required). ● Experience: ○ Previous volunteer experience, especially in community outreach or working with vulnerable populations, is preferred but not required. ○ Experience in event coordination, advocacy, or data collection is a plus. ● Education: ○ Relevant coursework or training in social work, psychology, or a related field is welcomed. Benefits ● Opportunity to make a meaningful impact in the community. ● Hands-on experience in community outreach and social services. ● Professional development and training opportunities. ● Flexible volunteer schedules to accommodate your availability. Note: This is an unpaid position. Contact Us Please reach out to us at info@brightmindenrichment.org. To apply for this position, email your resume to hr@brightmindenrichment.org. Learn more about our initiatives at Street Care (https://streetcare.us/) and Bright Mind (https://brightmindenrichment.org/). Bright Mind is a federally-recognized 501(c)(3) wellness education non-profit and recipient of awards and certifications in recognition of our achievements.' 
+ """ + keyword = extract_keyword(text) + print(f"선택된 키워드: {keyword}") \ No newline at end of file diff --git a/crawler/main_crawler.py b/crawler/main_crawler.py new file mode 100644 index 0000000..8fc4910 --- /dev/null +++ b/crawler/main_crawler.py @@ -0,0 +1,23 @@ +from crawler.bbc_crawler import crawl as bbc_crawler +from crawler.wevity_crawler import crawl as wevity_crawler +from crawler.idealist_crawler import crawl as idealist_crawler +from crawler.unv_crawler import crawl as unv_crawler +from crawler.v1365_crawler import crawl as v1365_crawler + +if __name__ == "__main__": + # BBC News + # bbc_crawler() + + # WEVITY + wevity_crawler() + + # 1365 + v1365_crawler() + + # IDEALIST + # idealist_crawler() + + # UNVOLUNTEERS + # unv_crawler() + + \ No newline at end of file diff --git a/crawler/save_to_db.py b/crawler/save_to_db.py new file mode 100644 index 0000000..94ea4cc --- /dev/null +++ b/crawler/save_to_db.py @@ -0,0 +1,84 @@ +from server.db import run_query +import uuid + +def save_issues(issues): + if not issues: + print("[DB] 저장할 이슈가 없습니다.") + return + + print("[DB] 크롤링한 이슈 DB 저장 중...") + + sql = """ + INSERT IGNORE INTO issues ( + issue_id, + created_at, + content, + image_url, + issue_date, + keyword, + site_url, + title + ) VALUES (%s, UTC_TIMESTAMP(6), %s, %s, %s, %s, %s, %s) + """ + + values = [ + ( + uuid.uuid4().bytes, + issue['content'], + issue['image_url'], + issue['issue_date'], + issue['keyword'], + issue['site_url'], + issue['title'] + ) + for issue in issues + ] + + if values: + saved_rows = run_query(sql, values) + print(f"[DB] {saved_rows}개의 이슈가 저장되었습니다.") + +def save_activities(activities): + if not activities: + print("[DB] 저장할 활동이 없습니다.") + return + + print("[DB] 크롤링한 활동 DB 저장 중...") + + sql = """ + INSERT IGNORE INTO activities ( + created_at, + end_date, + start_date, + activity_id, + activity_image_url, + activity_name, + site_url, + activity_content, + activity_site, + activity_type, + keyword + ) VALUES (UTC_TIMESTAMP(6), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + + values = [ + ( + activity['end_date'], + activity['start_date'], + uuid.uuid4().bytes, + activity['activity_image_url'], + activity['activity_name'], + activity['site_url'], + activity['activity_content'], + activity['activity_site'], + activity['activity_type'], + activity['keyword'] + ) + for activity in activities + ] + + if values: + saved_rows = run_query(sql, values) + print(f"[DB] {saved_rows}개의 활동이 저장되었습니다.") + + diff --git a/crawler/unv_crawler.py b/crawler/unv_crawler.py new file mode 100644 index 0000000..beea769 --- /dev/null +++ b/crawler/unv_crawler.py @@ -0,0 +1,112 @@ +import requests +from datetime import datetime, timezone +from crawler.keyword_extractor import extract_keyword +from crawler.save_to_db import save_activities +from server.db import run_query + +PAGE_ENDPOINT = "https://app.unv.org/api/doa/doa/SearchDoaAsyncByAzureCognitive" +DETAIL_ENDPOINT = "https://app.unv.org/api/doa/doa/" +URL_BASE = "https://app.unv.org/opportunities/" +HEADERS = { + "User-Agent": "Mozilla/5.0" +} +DEFAULT_IMAGE_URL = "https://www.unv.org/sites/default/files/unvol.png" + +def get_latest_activity_id(): + query = """ + SELECT CAST(SUBSTRING_INDEX(site_url, '/', -1) AS UNSIGNED) as activity_id + FROM activities + WHERE activity_site = "UNVOLUNTEERS" + ORDER BY activity_id DESC + LIMIT 1 + """ + result = run_query(query) + + return int(result[0][0]) if result else 0 + +def get_total_count(): + payload = { + "take": 1, + "skip": 0 + } + response = requests.post(PAGE_ENDPOINT, 
headers=HEADERS, json=payload) + data = response.json() + total_count = data["value"]["total"] + + return total_count + +def iso_to_utc(date_str): + if not date_str: + return None + + return datetime.fromisoformat(date_str) + +def fetch_activity_id_list(): + latest_activity_id = get_latest_activity_id() + total_count = get_total_count() + + # API 요청 + response = requests.post( + PAGE_ENDPOINT, + headers=HEADERS, + json={"skip": 0, "take": total_count} + ) + activities = response.json()["value"]["result"] + + # 마지막 활동이 있으면 그 이후의 데이터만, 없으면 전체 데이터를 가져옴 + if latest_activity_id > 0: + print(f"[UNV] DB의 마지막 활동 이후 데이터만 크롤링 시작 (ID : {latest_activity_id})") + return [activity["id"] for activity in activities if activity["id"] > latest_activity_id] + else: + print(f"[UNV] DB에 활동 없음, 모든 데이터 크롤링 시작") + return [activity["id"] for activity in activities] + +def fetch_activity_detail(activity_id_list): + activities = [] + + for activity_id in activity_id_list: + response = requests.get(DETAIL_ENDPOINT + str(activity_id), headers=HEADERS) + data = response.json()['value'] + + activity_content = ( + f"[Mission and objectives] : {data.get('organizationMission', '')}" + f"[Context] : {data.get('context', '')}" + f"[Task description] : {data.get('taskDescription', '')}" + f"[Required experience]: {data.get('requiredSkillExperience', '')}" + ) + activity_name = data.get("name") + start_date = iso_to_utc(data.get("publishDate")) + end_date = data.get("sourcingEndDate") + site_url = URL_BASE + str(activity_id) + keyword = extract_keyword(data.get('organizationMission') or activity_name) + + activities.append( + { + "activity_site": "UNVOLUNTEERS", + "activity_type": "VOLUNTEER", + "activity_content": activity_content, + "end_date": end_date, + "site_url": site_url, + "activity_image_url": DEFAULT_IMAGE_URL, + "keyword": keyword, + "activity_name": activity_name, + "start_date": start_date + }) + + print(f"[UNV] 활동 크롤링 완료 : {activity_name}") + + return activities + +def crawl(): + print("[UNV] 크롤링 시작") + activity_id_list = fetch_activity_id_list() + + if activity_id_list: + activities = fetch_activity_detail(activity_id_list) + print(f"[UNV] 크롤링 완료 : {len(activity_id_list)}개의 활동을 크롤링했습니다.") + save_activities(activities) + else: + print("[UNV] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == "__main__": + crawl() \ No newline at end of file diff --git a/crawler/v1365_crawler.py b/crawler/v1365_crawler.py new file mode 100644 index 0000000..780fd56 --- /dev/null +++ b/crawler/v1365_crawler.py @@ -0,0 +1,173 @@ +import re +import httpx +import asyncio +import requests +from bs4 import BeautifulSoup +from server.db import run_query +from crawler.save_to_db import save_activities +from crawler.keyword_extractor import extract_keyword +from itertools import chain + +LIST_ENDPOINT = "https://www.1365.go.kr/vols/1572247904127/partcptn/timeCptn.do" +DETAIL_ENDPOINT = "https://www.1365.go.kr/vols/1572247904127/partcptn/timeCptn.do?type=show&progrmRegistNo=" +HEADERS = { + "User-Agent": "Mozilla/5.0" +} +DEFAULT_IMAGE_URL = "https://play-lh.googleusercontent.com/9Kheg_iekobkZlP9XzKtwv_j_YL88oVzHCtHe4_hIL3JcQabCL3FFEw4vKzL1XQc8GE" +BATCH_SIZE = 5 # 한번에 BATCH_SIZE개의 HTTP 요청을 보냄 +MAX_CRAWL_PAGE = 10 # 크롤링할 페이지 수 + +async def get_soup(url, params=None): + """URL에 GET 요청을 보내고 BeautifulSoup 객체를 반환""" + async with httpx.AsyncClient() as client: + response = await client.get(url, params=params, headers=HEADERS) + return BeautifulSoup(response.text, "html.parser") + +def get_exist_ids(): + """DB에서 이미 존재하는 모든 활동 ID들을 리스트로 반환""" + sql = """ 
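+ -- site_url에서 'progrmRegistNo=' 뒤의 숫자를 정수 ID로 파싱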
+ SELECT CAST( + SUBSTRING_INDEX(site_url, 'progrmRegistNo=', -1) AS UNSIGNED) AS id + FROM activities + WHERE activity_site = "KRVOLUNTEERS" + ORDER BY id DESC + """ + result = run_query(sql) + return [int(row[0]) for row in result] if result else [] + +def get_last_page(): + """1365사이트의 마지막 페이지 번호를 반환""" + params = { + "requstSe": "N", + "adultPosblAt": "Y", + "yngbgsPosblAt": "Y", + } + response = requests.get(LIST_ENDPOINT, params=params, headers=HEADERS) + soup = BeautifulSoup(response.text, "html.parser") + + btn_last = soup.find('a', class_='btn_last') + last_page = btn_last.get('href').split('=')[-1] + + return int(last_page) + +def extract_name(soup): + """상세 페이지에서 활동 이름을 추출""" + name = soup.select_one('h3.tit_board_view input').get('value') + return name if name else None + +def extract_dates(soup): + """상세 페이지에서 봉사기간 시작일, 종료일을 추출""" + period = soup.find('dt', string='봉사기간') + if period: + period = period.find_next('dd').text + start_date, end_date = period.split(' ~ ') + return start_date.replace('.', '-'), end_date.replace('.', '-') + return None, None + +def extract_content(soup): + """상세 페이지에서 활동 내용을 추출""" + pre_tag = soup.find('pre') + if pre_tag: + return re.sub(r'[\r\n]+', ' ', pre_tag.get_text(separator="\n", strip=True)) + return "" + +async def extract_ids(page): + """해당 페이지의 활동 ID들을 리스트 형태로 반환""" + params = { + "cPage": page, + "requstSe": "N", + "adultPosblAt": "Y", + "yngbgsPosblAt": "Y", + } + soup = await get_soup(LIST_ENDPOINT, params=params) + + id_list = [] + ul = soup.select_one("ul.list_wrap.wrap2") + if ul: + a_tags = ul.find_all("a", href=True) + for a in a_tags: + href = a['href'] + match = re.search(r'show\((\d+)\)', href) + if match: + id = int(match.group(1)) + id_list.append(id) + + return id_list + +async def fetch_detail(id): + """해당 ID에 해당하는 활동의 상세정보를 추출""" + url = f"{DETAIL_ENDPOINT}{id}" + soup = await get_soup(url) + if not soup: + return None + + start_date, end_date = extract_dates(soup) + activity_content = extract_content(soup) + keyword = extract_keyword(activity_content) + activity_name = extract_name(soup) + + return { + "activity_site": "KRVOLUNTEERS", + "activity_type": "VOLUNTEER", + "activity_content": activity_content, + "end_date": end_date, + "activity_image_url": DEFAULT_IMAGE_URL, + "keyword": keyword, + "activity_name": activity_name, + "site_url": url, + "start_date": start_date + } + +async def crawl_async(): + """비동기적으로 1365 자원봉사 사이트에서 활동 정보를 수집""" + last_page = get_last_page() + start_page = max(last_page - MAX_CRAWL_PAGE, 1) # 시작할 페이지 계산 + exist_ids = get_exist_ids() + id_list = [] + activities = [] + print(f"[1365] 최근 {MAX_CRAWL_PAGE} 개의 페이지 ({start_page} ~ {last_page}) 에서 ID 수집중... 
") + + # ID 수집 (start_page부터 last_page까지 BATCH_SIZE씩 증가) + for start in range(start_page, last_page + 1, BATCH_SIZE): + tasks = [] + end = min(start + BATCH_SIZE, last_page + 1) + + for current_page in range(start, end): + tasks.append(extract_ids(current_page)) + + result = await asyncio.gather(*tasks) + id_list.extend(chain.from_iterable(result)) + + # DB와 비교하여 새로운 ID만 남김 + filtered_id_list = list(set(id_list) - set(exist_ids)) + if not filtered_id_list: + return [] + print(f"[1365] {len(filtered_id_list)} 개의 새로운 활동 ID 수집 완료") + + # DB에 없는 새로운 활동의 상세정보 수집 (BATCH_SIZE 단위로) + for i in range(0, len(filtered_id_list), BATCH_SIZE): + batch = filtered_id_list[i:i + BATCH_SIZE] + detail_tasks = [fetch_detail(id) for id in batch] + try: + print(f"[1365] {len(filtered_id_list)} 개의 활동 중 {i+1} ~ {i+BATCH_SIZE} 의 상세정보 수집 중...") + results = await asyncio.gather(*detail_tasks) + # None이 아닌 결과만 추가 + activities.extend([r for r in results if r is not None]) + except Exception as e: + print(f"Error processing batch {i}: {e}") + continue + + return activities + +def crawl(): + """외부 호출용 크롤링 함수""" + print("[1365] 크롤링 시작") + activities = asyncio.run(crawl_async()) + if activities: + print(f"[1365] 크롤링 완료 : {len(activities)}개의 활동을 크롤링했습니다.") + save_activities(activities) + else: + print("[1365] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == '__main__': + crawl() \ No newline at end of file diff --git a/crawler/wevity_crawler.py b/crawler/wevity_crawler.py new file mode 100644 index 0000000..c682a4d --- /dev/null +++ b/crawler/wevity_crawler.py @@ -0,0 +1,190 @@ +import requests +import time +from bs4 import BeautifulSoup +import re +from datetime import datetime +from urllib.parse import urlparse, parse_qs +from server.db import run_query +from crawler.save_to_db import save_activities +from crawler.keyword_extractor import extract_keyword + +BASE_URL = "https://www.wevity.com" +FILE_NAME = "data/wevity_data.json" +HEADERS = { + "User-Agent": "Mozilla/5.0" +} +MAX_CRAWL_PAGE = 10 + +def get_soup(url): + """웹 페이지를 요청하고 BeautifulSoup 객체 반환""" + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + +def is_special_activity(a_tag): + """SPECIAL 게시물인지 확인""" + return bool(a_tag.select_one("span.stat.spec")) + +def get_latest_activity_id(): + """DB에서 가장 마지막 활동을 조회""" + sql = """ + SELECT CAST( + SUBSTRING_INDEX( + SUBSTRING_INDEX(site_url, 'ix=', -1), + '&', 1 + ) AS UNSIGNED + ) as activity_id + FROM activities + WHERE activity_site = "WEVITY" + ORDER BY activity_id DESC + LIMIT 1; + """ + result = run_query(sql) + + if result and result[0][0]: + return int(result[0][0]) + else: + return 0 + +def get_image_url(soup): + """썸네일 이미지 URL 추출""" + img_tag = soup.select_one("div.thumb img") + if not img_tag or not img_tag.has_attr("src"): + return "" + img_src = img_tag["src"] + return BASE_URL + img_src if img_src.startswith("/") else img_src + +def get_date_range(soup): + """접수기간 추출""" + for li in soup.select("li"): + if "접수기간" in li.get_text(): + match = re.search(r'(\d{4}-\d{2}-\d{2})\s*~\s*(\d{4}-\d{2}-\d{2})', li.get_text()) + if match: + try: + start_date = datetime.strptime(match.group(1), "%Y-%m-%d").replace(hour=0, minute=0, second=0, microsecond=0).isoformat() + end_date = datetime.strptime(match.group(2), "%Y-%m-%d").replace(hour=23, minute=59, second=59, microsecond=999999).isoformat() + return start_date, end_date + except ValueError: + pass + return None, None + +def get_activity_type(soup): + """활동 유형(카테고리) 결정""" + li_tag = 
soup.select_one("ul.cd-info-list li") + + if li_tag: + span_tag = li_tag.find("span", class_="tit") + span_tag.decompose() + category_text = li_tag.get_text(strip=True) + + if category_text == "대외활동/서포터즈": + return "SUPPORTERS" + elif category_text == "봉사활동": + return "VOLUNTEER" + else: + return "CONTEST" + +def get_activity_urls(list_url, last_activity_id): + """활동 목록 페이지에서 새로운 활동 URL들을 수집""" + activity_urls = [] + soup = get_soup(list_url) + activity_items = soup.select("ul.list li") + + for item in activity_items: + # 진행 중인 게시물만 처리 + if item.select_one("span.dday.end"): + continue + + link_tag = item.select_one("a") + if not link_tag: + continue + + activity_url = BASE_URL + link_tag['href'] + # URL에서 활동 ID 추출 + parsed = urlparse(activity_url) + query_params = parse_qs(parsed.query) + current_activity_id = int(query_params.get('ix', ['0'])[0]) + + # ID값이 ID의 마지막 활동 id보다 크면 추가 + if current_activity_id > last_activity_id: + activity_urls.append(activity_url) + # 특별 게시물이 아닌 경우에 ix 값이 작거나 같으면 더 이상 새로운 게시물이 없으므로 종료 + elif not is_special_activity(link_tag): + return activity_urls + + return activity_urls + +def get_activity_detail(url): + """활동 상세 페이지에서 데이터 추출""" + try: + soup = get_soup(url) + + activity_type = get_activity_type(soup) + activity_content = soup.select_one("#viewContents").get_text(strip=True) or None + activity_name = soup.select_one("h6.tit").get_text(strip=True) or None + start_date, end_date = get_date_range(soup) + activity_image_url = get_image_url(soup) + keyword = extract_keyword(activity_content) + + return { + "activity_site": "WEVITY", + "activity_type": activity_type, + "activity_content": activity_content, + "end_date": end_date, + "activity_image_url": activity_image_url, + "keyword": keyword, + "activity_name": activity_name, + "site_url": url, + "start_date": start_date + } + + except Exception as e: + print(f"[ERROR] {url} 에서 오류 발생: {e}") + return None + +def crawl(): + """위비티 활동 크롤링 실행""" + print("[WEVITY] 크롤링 시작") + + last_activity_id = get_latest_activity_id() + + if last_activity_id > 0: + print(f"[WEVITY] DB의 마지막 활동 이후 데이터만 크롤링 시작 (ID : {last_activity_id})") + else: + print(f"[WEVITY] DB에 활동 없음, 모든 데이터 크롤링 시작") + + collected_urls = [] + page = 1 + + print("[WEVITY] 페이지별 활동 링크 수집 중...") + while True and page <= MAX_CRAWL_PAGE: + paged_url = f"{BASE_URL}/?c=find&s=1&gp={str(page)}" + try: + new_urls = get_activity_urls(paged_url, last_activity_id) + if not new_urls: + break + collected_urls.extend(new_urls) + page += 1 + except Exception as e: + print(f"[ERROR] 목록 페이지 {paged_url} 에서 오류 발생: {e}") + break + + crawled_activities = [] + if collected_urls: + print(f"[WEVITY] {len(collected_urls)}개의 활동 링크 수집 완료") + print("[WEVITY] 활동 상세내용 크롤링 중...") + for url in collected_urls: + activity_data = get_activity_detail(url) + time.sleep(1.1) # LLM API 요청 간 시간 간격을 두기 위해 1.1초 대기 + if activity_data: + crawled_activities.append(activity_data) + print(f"[WEVITY] 활동 크롤링 완료 : {activity_data['activity_name']}") + + if crawled_activities: + print(f"[WEVITY] 크롤링 완료 : {len(crawled_activities)}개의 활동을 크롤링했습니다.") + save_activities(crawled_activities) + else: + print("[WEVITY] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == "__main__": + crawl() diff --git a/custom_keyword/ext.py b/custom_keyword/ext.py index 5dedb70..d413803 100644 --- a/custom_keyword/ext.py +++ b/custom_keyword/ext.py @@ -1,11 +1,9 @@ from transformers import BertTokenizer, BertModel from sklearn.metrics.pairwise import cosine_similarity -import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertModel.from_pretrained('bert-base-uncased') -domain_keywords = ["environment", "Society", "Economic", "technology"] +domain_keywords = ["Economy", "Environment", "Technology", "People", "Society"] def get_embeddings(text: str): inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512) @@ -22,7 +20,7 @@ def extract_keywords(question: str): Args: question (str): 키워드를 추출하고자 하는 문자열 Returns: - str: 가장 유사도가 높은 키워드 + str: 가장 유사도가 높은 키워드 (DB enum 형식) """ sentence_embedding = get_embeddings(question) domain_embeddings = [get_embeddings(keyword) for keyword in domain_keywords] @@ -30,4 +28,8 @@ def extract_keywords(question: str): (keyword, calculate_cosine_similarity(sentence_embedding, embedding)[0][0]) for keyword, embedding in zip(domain_keywords, domain_embeddings) ] - return max(similarities, key=lambda x: x[1])[0] + extracted_keyword = max(similarities, key=lambda x: x[1])[0] + if extracted_keyword.lower() in ["people", "society"]: + return "PeopleAndSociety" + else: + return extracted_keyword diff --git a/server/db.py b/server/db.py index 8bb8d5b..eb8ff68 100644 --- a/server/db.py +++ b/server/db.py @@ -16,8 +16,23 @@ def run_query(query: str, params=None): conn = pool.get_connection() cursor = conn.cursor() - cursor.execute(query, params) - results = cursor.fetchall() - cursor.close() - conn.close() # 풀로 반환 - return results + try: + affected_rows = 0 + + # 여러 레코드 삽입인 경우 + if params and isinstance(params, list) and isinstance(params[0], tuple): + cursor.executemany(query, params) + affected_rows = cursor.rowcount + else: + cursor.execute(query, params) + affected_rows = cursor.rowcount + + # SELECT 처리 + if query.strip().lower().startswith("select"): + return cursor.fetchall() + else: + conn.commit() + return affected_rows + finally: + cursor.close() + conn.close() \ No newline at end of file diff --git a/summarization/sum_translate.py b/summarization/sum_translate.py index 54efe88..db9092c 100644 --- a/summarization/sum_translate.py +++ b/summarization/sum_translate.py @@ -9,31 +9,30 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) model = "gpt-4" -def summarize_translate_en_to_ko(text: str) -> str: +def translate_en_to_ko(text: str) -> str: """ - 영어 텍스트를 한국어로 번역하고 요약. + 영어 텍스트를 한국어로 번역. Args: text (str): 번역하고자 하는 원문(영어) 텍스트 Returns: - str: 요약된 한국어 번역 결과 + str: 한국어 번역 결과 """ prompt = f""" - Translate and summarize the following English text **into Korean** in **one or two sentences only**. - Focus on capturing the key message, and write naturally in Korean. + Translate the following English text **into Korean**. + Maintain the original tone and context as accurately as possible. 
-
     Text: {text}
     """

     response = client.chat.completions.create(
         model=model,
         messages=[
-            {"role": "system", "content": "You are a professional translator and summarizer."},
+            {"role": "system", "content": "You are a professional translator."},
             {"role": "user", "content": prompt}
         ],
         temperature=0.3,
         max_tokens=600
     )

-    return response.choices[0].message.content.strip()
\ No newline at end of file
+    return response.choices[0].message.content.strip()

From cd8dfe39601afdd6b52754a86ae9da1d24a33aff Mon Sep 17 00:00:00 2001
From: seominjae1
Date: Sat, 31 May 2025 11:15:34 +0900
Subject: [PATCH 11/28] =?UTF-8?q?paddleocr=20=EB=8C=80=EC=8B=A0=20gemini?=
 =?UTF-8?q?=EB=A1=9C=20=EB=B3=80=EA=B2=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switched the OCR step from PaddleOCR to Gemini to reduce processing time.

---
 ocr/__init__.py  |  27 +++------
 ocr/o.py         | 126 ++++++++++++++++++---------------
 requirements.in  |   7 +--
 requirements.txt | 142 ++++++++--------------------------------------
 4 files changed, 91 insertions(+), 211 deletions(-)

diff --git a/ocr/__init__.py b/ocr/__init__.py
index 6426c86..1cf6402 100644
--- a/ocr/__init__.py
+++ b/ocr/__init__.py
@@ -2,7 +2,7 @@
 from flasgger import Swagger, swag_from
 
 from server.logger import logger
-from .o import download_image, extract_text, compare_texts
+from .o import is_review_valid
 
 ocr_bp = Blueprint('ocr', __name__, url_prefix='/ocr')
 
@@ -63,28 +63,17 @@ def evaluate_image():
 
     data=request.get_json()
 
-    review_img_path=data.get("imageUrls")
-    award_img_path=data.get("awardImgUrl")
-    compare_text=data.get("title")
+    imageUrls=data.get("imageUrls")
+    awardImgUrl=data.get("awardImgUrl")
+    title=data.get("title")
 
-    print(review_img_path)
-    print(award_img_path)
-    print(compare_text)
     # OCR 실행
-    if review_img_path:
+    if imageUrls:
         # ocr결과의 기본값은 False
         ocr_result = "False"
-        for img_url in review_img_path:
-            image_stream = download_image(img_url)
-            extracted_text = extract_text(image_stream)
-            print(extracted_text)
-            ocr_result = compare_texts(extracted_text, compare_text)
-            if ocr_result == "True":
-                break
-    if award_img_path != None:
-        award_image_stream = download_image(award_img_path)
-        award_text = extract_text(award_image_stream)
-        award_ocr_result = compare_texts(award_text, compare_text)
+        ocr_result=is_review_valid(title, imageUrls)
+    if awardImgUrl != None:
+        award_ocr_result=is_review_valid(title,awardImgUrl)
     else:
         award_ocr_result = "False"
 
diff --git a/ocr/o.py b/ocr/o.py
index ee89ee3..4dc288f 100644
--- a/ocr/o.py
+++ b/ocr/o.py
@@ -1,89 +1,79 @@
-import torch
 import os
-import numpy as np
-import cv2
 import requests
+from google import genai
+from google.genai import types
 from dotenv import load_dotenv
-from paddleocr import PaddleOCR
-from openai import OpenAI
-from io import BytesIO
 
 # .env파일 로드
 load_dotenv()
 
-# 환경 변수에서 API 키 가져오기
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-model = "gpt-4"
-ocr = PaddleOCR(lang="korean")
+api_key = os.getenv("GEMINI_API_KEY")
+MODEL_NAME = "gemini-1.5-flash"
 
-def download_image(img_path):
+def is_review_valid(title: str, image_urls: list[str]) -> bool:
     """
-    s3에서 이미지 다운로드 후 바이트스트림에 저장
+    리뷰 제목과 이미지들을 기반으로 리뷰가 유효한지 판단합니다.
Args: - img_path (str): s3상에 이미지 경로 + title (str): 리뷰 제목 + image_urls (list[str]): 이미지 URL 리스트 (최대 5개 권장) + Returns: - BytesIO: 이미지 데이터의 바이트스트림 객체 + bool: 리뷰가 유효하면 True, 그렇지 않으면 False """ + + image_parts = [] - presigned_url = img_path - print("presigned_url", presigned_url) - # 이미지 다운로드 (바이너리 형태) - response = requests.get(presigned_url) - -# 응답 확인 및 메모리에 저장 - if response.status_code == 200: - image_stream = BytesIO(response.content) - - return image_stream + # 각 이미지 URL을 순회하며 이미지 바이트를 가져와 types.Part 객체로 변환 + for url in image_urls[:5]: # 최대 5개 이미지만 처리 + if url: # URL이 비어있지 않은지 확인 + try: + response = requests.get(url, timeout=5) # 타임아웃 추가 + response.raise_for_status() # HTTP 오류 (4xx, 5xx) 발생 시 예외 발생 + + # MIME 타입 확인 (없으면 기본값 사용) + content_type = response.headers.get('Content-Type', 'image/jpeg') + + # 이미지 바이트를 types.Part 객체로 변환하고 리스트에 추가 + image_part = types.Part.from_bytes(data=response.content, mime_type=content_type) + image_parts.append(image_part) + except requests.exceptions.RequestException as e: + print(f"경고: 이미지를 가져오거나 처리하는 데 실패했습니다. URL: {url}, 오류: {e}") + continue + except Exception as e: + print(f"경고: 이미지 {url} 처리 중 예상치 못한 오류 발생: {e}") + continue + if not image_parts: + print("경고: 유효한 이미지를 찾거나 가져오지 못했습니다. False를 반환합니다.") + return False # 이미지가 없거나 모두 실패하면 유효하지 않다고 판단 -def extract_text(image_stream): - """ - BytesIO 객체의 이미지를 대상으로 OCR 수행 - - Args: - image_stream (BytesIO): 메모리에 저장된 이미지 데이터 - Returns: - list: OCR 결과 + prompt = f"""리뷰 제목: "{title}" + 리뷰의 제목과, 이미지들에 포함된 텍스트를 하나씩 비교합니다. + 하나라도 맞는 경우 문자열 True를 모두 아닐 경우 False를 반환합니다. """ - # 스트림을 numpy 배열로 변환 - image_stream.seek(0) # 읽기 위치 초기화 - file_bytes = np.frombuffer(image_stream.getvalue(), dtype=np.uint8) - img = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR) # OpenCV를 사용하여 이미지 디코딩 - # OCR 수행 - ocr = PaddleOCR(lang='korean') # 언어 설정 가능 - results = ocr.ocr(img, cls=True) - return " ".join(text for result in results for _, (text, _) in result) - -def compare_texts(text1, text2): - """ - ocr로 추출한 텍스트와 활동 제목 간의 관계 분석 - Args: - text1 (str): 이미지에서 추출한 문자열 - text2 (str): 활동 제목에서의 문자열 - Returns: - str: 관련이 있다 판단 시 True / 없다 판단 시 False를 반환 - """ - prompt = f""" - Analyze the relationship between the following two texts. Determine whether they are conceptually or contextually related. - If they are related, return True; otherwise, return False without additional explanation + # 텍스트 프롬프트와 모든 이미지 파트를 contents 리스트로 결합 + contents = [prompt] + image_parts - Text 1: - {text1} - Text 2: - {text2} - """ + client = genai.Client(api_key=api_key) + try: + response = client.models.generate_content( + model=MODEL_NAME, + contents=contents, + config={ + # 응답형식을 True, False로 제한 + 'response_mime_type': 'text/x.enum', + 'response_schema': { + "type": "STRING", + "enum": ["True", "False"] + } + } + ) - response = client.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": "You are an objective analyst. 
Compare the following two texts and determine their relationship strictly based on content."}, - {"role": "user", "content": prompt} - ], - temperature=0, - max_tokens=600 - ) + print("responsetext: ",response.text) + return response.text.strip() == "True" - return response.choices[0].message.content.strip() \ No newline at end of file + except Exception as e: + print(f"API 호출 실패: {e}") + return False # API 호출 실패 시 유효하지 않다고 판단 \ No newline at end of file diff --git a/requirements.in b/requirements.in index d663ace..80f725d 100644 --- a/requirements.in +++ b/requirements.in @@ -14,12 +14,7 @@ scikit-learn # Generative AI openai - -# OCR -paddleocr -paddlepaddle -# gpu 사용할 경우 해당 패키지 포함 -#paddlepaddle-gpu==2.5.0.post118 +google-genai>=0.1.0 # MySQL mysql-connector-python diff --git a/requirements.txt b/requirements.txt index ee748ff..b02b51a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,34 +4,21 @@ # # pip-compile requirements.in # -albucore==0.0.24 - # via - # albumentations - # paddleocr -albumentations==2.0.6 - # via paddleocr annotated-types==0.7.0 # via pydantic anyio==4.9.0 # via + # google-genai # httpx # openai -astor==0.8.1 - # via paddlepaddle attrs==25.3.0 # via # jsonschema # referencing -beautifulsoup4==4.13.4 - # via paddleocr blinker==1.9.0 # via flask -boto3==1.38.23 - # via -r requirements.in -botocore==1.38.23 - # via - # boto3 - # s3transfer +cachetools==5.5.2 + # via google-auth certifi==2025.1.31 # via # httpcore @@ -45,10 +32,6 @@ colorama==0.4.6 # via # click # tqdm -cython==3.1.0 - # via paddleocr -decorator==5.2.1 - # via paddlepaddle distro==1.9.0 # via openai filelock==3.18.0 @@ -56,8 +39,6 @@ filelock==3.18.0 # huggingface-hub # torch # transformers -fire==0.7.0 - # via paddleocr flasgger==0.9.7.1 # via -r requirements.in flask==3.1.0 @@ -67,20 +48,22 @@ flask==3.1.0 # flask-cors flask-cors==5.0.1 # via -r requirements.in -fonttools==4.58.0 - # via paddleocr fsspec==2025.3.2 # via # huggingface-hub # torch +google-auth==2.40.2 + # via google-genai +google-genai==1.18.0 + # via -r requirements.in h11==0.16.0 # via httpcore httpcore==1.0.9 # via httpx httpx==0.28.1 # via + # google-genai # openai - # paddlepaddle huggingface-hub==0.31.1 # via # tokenizers @@ -90,8 +73,6 @@ idna==3.10 # anyio # httpx # requests -imageio==2.37.0 - # via scikit-image itsdangerous==2.2.0 # via flask jinja2==3.1.6 @@ -100,22 +81,12 @@ jinja2==3.1.6 # torch jiter==0.9.0 # via openai -jmespath==1.0.1 - # via - # boto3 - # botocore joblib==1.5.0 # via scikit-learn jsonschema==4.23.0 # via flasgger jsonschema-specifications==2025.4.1 # via jsonschema -lazy-loader==0.4 - # via scikit-image -lmdb==1.6.2 - # via paddleocr -lxml==5.4.0 - # via python-docx markupsafe==3.0.2 # via # jinja2 @@ -127,81 +98,38 @@ mpmath==1.3.0 mysql-connector-python==9.3.0 # via -r requirements.in networkx==3.4.2 - # via - # paddlepaddle - # scikit-image - # torch + # via torch numpy==2.2.5 # via - # albucore - # albumentations - # imageio - # opencv-contrib-python - # opencv-python - # opencv-python-headless - # opt-einsum - # paddleocr - # paddlepaddle - # scikit-image # scikit-learn # scipy - # shapely - # tifffile # transformers openai==1.78.1 # via -r requirements.in -opencv-contrib-python==4.11.0.86 - # via paddleocr -opencv-python==4.11.0.86 - # via paddleocr -opencv-python-headless==4.11.0.86 - # via - # albucore - # albumentations -opt-einsum==3.3.0 - # via paddlepaddle packaging==25.0 # via # flasgger # huggingface-hub - # lazy-loader - # scikit-image # transformers -paddleocr==2.10.0 - # via -r 
requirements.in -paddlepaddle==3.0.0 - # via -r requirements.in -pillow==11.2.1 +pyasn1==0.6.1 # via - # imageio - # paddleocr - # paddlepaddle - # scikit-image -protobuf==6.31.0 - # via paddlepaddle -pyclipper==1.3.0.post6 - # via paddleocr + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth pydantic==2.11.4 # via - # albumentations + # google-genai # openai pydantic-core==2.33.2 # via pydantic -python-dateutil==2.9.0.post0 - # via botocore -python-docx==1.1.2 - # via paddleocr python-dotenv==1.1.0 # via -r requirements.in pyyaml==6.0.2 # via - # albumentations # flasgger # huggingface-hub - # paddleocr # transformers -rapidfuzz==3.13.0 - # via paddleocr referencing==0.36.2 # via # jsonschema @@ -211,50 +139,31 @@ regex==2024.11.6 requests==2.32.3 # via # -r requirements.in + # google-genai # huggingface-hub - # paddleocr # transformers rpds-py==0.24.0 # via # jsonschema # referencing -s3transfer==0.13.0 - # via boto3 +rsa==4.9.1 + # via google-auth safetensors==0.5.3 # via transformers -scikit-image==0.25.2 - # via paddleocr scikit-learn==1.6.1 # via -r requirements.in scipy==1.15.3 - # via - # albumentations - # scikit-image - # scikit-learn -shapely==2.1.0 - # via paddleocr -simsimd==6.2.1 - # via albucore + # via scikit-learn six==1.17.0 - # via - # flasgger - # python-dateutil + # via flasgger sniffio==1.3.1 # via # anyio # openai -soupsieve==2.7 - # via beautifulsoup4 -stringzilla==3.12.5 - # via albucore sympy==1.14.0 # via torch -termcolor==3.1.0 - # via fire threadpoolctl==3.6.0 # via scikit-learn -tifffile==2025.5.10 - # via scikit-image tokenizers==0.21.1 # via transformers torch==2.7.0 @@ -263,29 +172,26 @@ tqdm==4.67.1 # via # huggingface-hub # openai - # paddleocr # transformers transformers==4.51.3 # via -r requirements.in typing-extensions==4.13.2 # via # anyio - # beautifulsoup4 + # google-genai # huggingface-hub # openai - # paddlepaddle # pydantic # pydantic-core - # python-docx # referencing # torch # typing-inspection typing-inspection==0.4.0 # via pydantic urllib3==2.3.0 - # via - # botocore - # requests + # via requests +websockets==15.0.1 + # via google-genai werkzeug==3.1.3 # via # flask From abca923a3308e88280b450f408b65c7be411de1b Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Sun, 1 Jun 2025 11:44:11 +0900 Subject: [PATCH 12/28] =?UTF-8?q?api=EC=98=A4=EB=A5=98=20=EC=88=98?= =?UTF-8?q?=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit awardImgUrl->awardImageUrl로 받아오도록 수정 --- ocr/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index 1cf6402..2f7b449 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -18,12 +18,12 @@ 'schema': { 'type': 'object', 'properties': { - 'image_urls': { + 'imageUrls': { 'type': 'array', 'items': {'type': 'string'}, 'description': '검토할 이미지 URL 리스트' }, - 'award_img_urls': { + 'awardImageUrl': { 'type': 'string', 'description': '수상 이미지의 URL' }, @@ -64,7 +64,7 @@ def evaluate_image(): data=request.get_json() imageUrls=data.get("imageUrls") - awardImgUrl=data.get("awardImgUrl") + awardImageUrl=data.get("awardImageUrl") title=data.get("title") # OCR 실행 @@ -72,8 +72,8 @@ def evaluate_image(): # ocr결과의 기본값은 False ocr_result = "False" ocr_result=is_review_valid(title, imageUrls) - if awardImgUrl != None: - award_ocr_result=is_review_valid(title,awardImgUrl) + if awardImageUrl != None: + award_ocr_result=is_review_valid(title,awardImageUrl) else: award_ocr_result = "False" From 
217814838e769e807c65708d35d6c218cecc925c Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Sun, 1 Jun 2025 13:33:14 +0900 Subject: [PATCH 13/28] =?UTF-8?q?ocr=EC=98=A4=EB=A5=98=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit awardocrimage가 잘못 인식되어 사진 체크가 되지 않음을 수정 --- ocr/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index 2f7b449..e9875a8 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -71,9 +71,11 @@ def evaluate_image(): if imageUrls: # ocr결과의 기본값은 False ocr_result = "False" - ocr_result=is_review_valid(title, imageUrls) + ocr_result=is_review_valid(title, imageUrls) if awardImageUrl != None: - award_ocr_result=is_review_valid(title,awardImageUrl) + awardImageUrlList=[awardImageUrl] + print("awardImgUrl",awardImageUrlList) + award_ocr_result=is_review_valid(title,awardImageUrlList) else: award_ocr_result = "False" From 32803bb44b8d99b950af5d86bf229cb72280413b Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:18:22 +0900 Subject: [PATCH 14/28] =?UTF-8?q?=EA=B9=83=ED=97=88=EB=B8=8C=20=EC=95=A1?= =?UTF-8?q?=EC=85=98=EC=9D=84=20=EC=9D=B4=EC=9A=A9=ED=95=9C=20ci/cd?= =?UTF-8?q?=EA=B5=AC=EC=B6=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.서버에 접속해서 git pull origin main실행 2.패키지 업데이트 및 설치 3.애플리케이션 실행 위의 과정이 순차로 이루어집니다 --- .github/workflows/deploy.yml | 19 +++++++++++++++++++ deploy_script.sh | 10 ++++++++++ 2 files changed, 29 insertions(+) create mode 100644 .github/workflows/deploy.yml create mode 100644 deploy_script.sh diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..d39308b --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,19 @@ +name: Deploy with GitHub Actions + +on: + push: + branches: + - main + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Setup SSH Key + run: | + echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem + chmod 600 id_rsa.pem + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd HIGHFIVE-AI/ && git pull origin main && ./deploy_script.sh" \ No newline at end of file diff --git a/deploy_script.sh b/deploy_script.sh new file mode 100644 index 0000000..a113942 --- /dev/null +++ b/deploy_script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +cd HIGHFIVE-AI/ || exit + +# 패키지 업데이트 및 설치 +pip-compile requirements.in +pip install -r requirements.txt + +# 애플리케이션 실행 +python app.py \ No newline at end of file From 5704ad8f5b107800d23d5c052f6edae861dc0afb Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:22:06 +0900 Subject: [PATCH 15/28] =?UTF-8?q?ci/cd=ED=85=8C=EC=8A=A4=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index e1ba0c4..8ec10be 100644 --- a/app.py +++ b/app.py @@ -6,7 +6,7 @@ from flasgger import Swagger from server.logger import logger - +#test # 현재 app.py 파일의 디렉토리 경로를 sys.path에 추가 current_dir = os.path.dirname(os.path.abspath(__file__)) if current_dir not in sys.path: From 14de5314fac912acf3a97f3c221de332b81872ab Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:25:15 +0900 Subject: [PATCH 16/28] =?UTF-8?q?ci/cd=ED=85=8C=EC=8A=A4=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- ocr/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index e9875a8..aa3b6f0 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -74,7 +74,6 @@ def evaluate_image(): ocr_result=is_review_valid(title, imageUrls) if awardImageUrl != None: awardImageUrlList=[awardImageUrl] - print("awardImgUrl",awardImageUrlList) award_ocr_result=is_review_valid(title,awardImageUrlList) else: award_ocr_result = "False" From 7c775e041e2827f723344914630d03968b2db76f Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:36:51 +0900 Subject: [PATCH 17/28] =?UTF-8?q?ci/cd=ED=85=8C=EC=8A=A4=ED=8A=B82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocr/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocr/__init__.py b/ocr/__init__.py index aa3b6f0..afae7a1 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -73,6 +73,7 @@ def evaluate_image(): ocr_result = "False" ocr_result=is_review_valid(title, imageUrls) if awardImageUrl != None: + print(awardImageUrl) awardImageUrlList=[awardImageUrl] award_ocr_result=is_review_valid(title,awardImageUrlList) else: From 3584e1bd98bc260d09f6d96882b72f2611a43c0e Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:41:52 +0900 Subject: [PATCH 18/28] =?UTF-8?q?ci/cd=EC=98=A4=EB=A5=98=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy.yml | 2 +- deploy_script.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index d39308b..279ce4a 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -16,4 +16,4 @@ jobs: run: | echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem chmod 600 id_rsa.pem - ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd HIGHFIVE-AI/ && git pull origin main && ./deploy_script.sh" \ No newline at end of file + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git pull origin main && ./deploy_script.sh" \ No newline at end of file diff --git a/deploy_script.sh b/deploy_script.sh index a113942..ee11dba 100644 --- a/deploy_script.sh +++ b/deploy_script.sh @@ -1,6 +1,6 @@ #!/bin/bash -cd HIGHFIVE-AI/ || exit +cd ~/HIGHFIVE-AI/ || exit # 패키지 업데이트 및 설치 pip-compile requirements.in From 57525217ec464864f438695bee68555eff258f86 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:48:47 +0900 Subject: [PATCH 19/28] =?UTF-8?q?ci/cd=ED=85=8C=EC=8A=A4=ED=8A=B83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocr/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocr/__init__.py b/ocr/__init__.py index afae7a1..aa3b6f0 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -73,7 +73,6 @@ def evaluate_image(): ocr_result = "False" ocr_result=is_review_valid(title, imageUrls) if awardImageUrl != None: - print(awardImageUrl) awardImageUrlList=[awardImageUrl] award_ocr_result=is_review_valid(title,awardImageUrlList) else: From 1135d6cb5209fa6f8563f007882b9c6fea165a23 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 13:54:07 +0900 Subject: [PATCH 20/28] =?UTF-8?q?ci/cd=EC=98=A4=EB=A5=98=EC=88=98=EC=A0=95?= =?UTF-8?q?2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.github/workflows/deploy.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 279ce4a..6b802c5 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -11,6 +11,10 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 + + - name: Set execute permissions for deploy script + run: chmod +x deploy_script.sh + - name: Setup SSH Key run: | From 7cbc655edb887a477bff12305f8ae4549bb1e46e Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 14:03:12 +0900 Subject: [PATCH 21/28] =?UTF-8?q?cicd=ED=85=8C=EC=8A=A4=ED=8A=B84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deploy_script.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deploy_script.sh b/deploy_script.sh index ee11dba..5a6df14 100644 --- a/deploy_script.sh +++ b/deploy_script.sh @@ -7,4 +7,5 @@ pip-compile requirements.in pip install -r requirements.txt # 애플리케이션 실행 -python app.py \ No newline at end of file +screen -dmS cicd python app.py +echo "Flask is running in a screen session." \ No newline at end of file From 5314425291be163374647f0b5ed51b8eb5a3f91d Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 14:06:25 +0900 Subject: [PATCH 22/28] =?UTF-8?q?cdcd=EC=88=98=EC=A0=953?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 6b802c5..b353452 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - + - name: Set execute permissions for deploy script run: chmod +x deploy_script.sh @@ -20,4 +20,4 @@ jobs: run: | echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem chmod 600 id_rsa.pem - ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git pull origin main && ./deploy_script.sh" \ No newline at end of file + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && pull origin main && ./deploy_script.sh" \ No newline at end of file From 3053e4b3e4076ac6f7842922abea1261d2276a7b Mon Sep 17 00:00:00 2001 From: seominjae1 <153708875+seominjae1@users.noreply.github.com> Date: Mon, 2 Jun 2025 14:08:23 +0900 Subject: [PATCH 23/28] Update deploy.yml --- .github/workflows/deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index b353452..2205596 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -20,4 +20,4 @@ jobs: run: | echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem chmod 600 id_rsa.pem - ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && pull origin main && ./deploy_script.sh" \ No newline at end of file + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && ./deploy_script.sh" From d13319f2a68e2602dafd2eb2628a934151b9c535 Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 14:16:09 +0900 Subject: [PATCH 24/28] 
=?UTF-8?q?cicd=EC=98=A4=EB=A5=98=EC=88=98=EC=A0=954?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index b353452..764d10c 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -13,11 +13,13 @@ jobs: uses: actions/checkout@v3 - name: Set execute permissions for deploy script - run: chmod +x deploy_script.sh + run: | + cd ~/HIGHFIVE-AI/ + chmod +x deploy_script.sh - name: Setup SSH Key run: | echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem chmod 600 id_rsa.pem - ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && pull origin main && ./deploy_script.sh" \ No newline at end of file + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && ./deploy_script.sh" \ No newline at end of file From 846ac013693e8d29f92e959e74c88115bfb00aff Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 14:22:00 +0900 Subject: [PATCH 25/28] =?UTF-8?q?cicd=EC=98=A4=EB=A5=98=EC=88=98=EC=A0=955?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 764d10c..ee26687 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -13,9 +13,7 @@ jobs: uses: actions/checkout@v3 - name: Set execute permissions for deploy script - run: | - cd ~/HIGHFIVE-AI/ - chmod +x deploy_script.sh + run: chmod +x ${{ github.workspace }}/deploy_script.sh - name: Setup SSH Key From 26bd37b637f53652e543a634204514716e47ee1e Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 14:32:52 +0900 Subject: [PATCH 26/28] =?UTF-8?q?cicd=EC=88=98=EC=A0=956?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 642dc73..befd977 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -12,8 +12,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 + - name: Check workspace directory + run: echo "current workspace${{ github.workspace }}" + - name: Set execute permissions for deploy script - run: chmod +x ${{ github.workspace }}/deploy_script.sh + run: chmod +x ${{ github.workspace }}/HIGHFIVE-AI/deploy_script.sh - name: Setup SSH Key From 8d1f93bfd23c0277a2276000c709337d1586e31a Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 14:40:36 +0900 Subject: [PATCH 27/28] =?UTF-8?q?cdcd=EC=88=98=EC=A0=957?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index befd977..e29498f 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -12,11 +12,8 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - - name: Check workspace directory - run: echo "current workspace${{ github.workspace }}" - - 
name: Set execute permissions for deploy script - run: chmod +x ${{ github.workspace }}/HIGHFIVE-AI/deploy_script.sh + run: chmod +x ${{ github.workspace }}/deploy_script.sh - name: Setup SSH Key @@ -24,4 +21,4 @@ jobs: echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem chmod 600 id_rsa.pem - ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && ./deploy_script.sh" \ No newline at end of file + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh" \ No newline at end of file From c28196c1a86604f3975dab2a1d37b9871d8edf8f Mon Sep 17 00:00:00 2001 From: seominjae1 Date: Mon, 2 Jun 2025 15:00:26 +0900 Subject: [PATCH 28/28] =?UTF-8?q?cicd=EC=88=98=EC=A0=958?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deploy_script.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deploy_script.sh b/deploy_script.sh index 5a6df14..bc0ae1d 100644 --- a/deploy_script.sh +++ b/deploy_script.sh @@ -6,6 +6,7 @@ cd ~/HIGHFIVE-AI/ || exit pip-compile requirements.in pip install -r requirements.txt -# 애플리케이션 실행 -screen -dmS cicd python app.py +# 기존의 스크린 삭제 후 재실행 +screen -S flask-server -X quit +screen -dmS flask-server bash -c "cd ~/HIGHFIVE-AI && python app.py" echo "Flask is running in a screen session." \ No newline at end of file
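
For reference, a minimal runner sketch tying the two crawlers in this series together. Each module exposes a synchronous crawl() entry point that prints its own progress and persists results through save_activities(), so a scheduler only has to call them in turn. The 1365 crawler's module name below is an assumption (wevity_crawler is the committed name):

# Minimal runner sketch. crawler.krvolunteers_crawler is a hypothetical
# module name for the 1365 crawler; crawler.wevity_crawler is as committed.
from crawler import wevity_crawler
from crawler import krvolunteers_crawler  # hypothetical module name

def run_all_crawlers():
    # Each crawl() handles its own logging and DB writes via save_activities().
    krvolunteers_crawler.crawl()
    wevity_crawler.crawl()

if __name__ == "__main__":
    run_all_crawlers()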
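
The reworked extract_keywords() in custom_keyword/ext.py now returns a single DB enum value instead of a ranked list, folding "People" and "Society" into "PeopleAndSociety". A usage sketch, assuming the module is importable as custom_keyword.ext and the bert-base-uncased weights are already cached locally:

from custom_keyword.ext import extract_keywords

# Returns one of "Economy", "Environment", "Technology", or "PeopleAndSociety";
# "People" and "Society" are collapsed into the single enum value before returning.
keyword = extract_keywords("Volunteers wanted for a neighborhood cleanup campaign")
print(keyword)  # whichever domain embedding scores the highest cosine similarity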
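
The reworked run_query() in server/db.py commits writes and routes a list of tuples through cursor.executemany(), which is what lets the crawlers batch-insert. A sketch of that write path follows; the column names mirror the crawler payload keys, and the actual INSERT used by save_activities() (not shown in this series) may differ:

from server.db import run_query

# Column names mirror the crawler payload keys; treat this statement as
# illustrative rather than the committed save_activities() query.
sql = """
    INSERT INTO activities
        (activity_site, activity_type, activity_content, end_date,
         activity_image_url, keyword, activity_name, site_url, start_date)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
rows = [
    ("WEVITY", "CONTEST", "sample content", "2025-06-30T23:59:59.999999",
     "https://www.wevity.com/img/sample.jpg", "Technology",
     "Sample contest", "https://www.wevity.com/?c=find&s=1&ix=99999",
     "2025-06-01T00:00:00"),
]

# A list of tuples triggers the executemany() branch; since the statement is
# not a SELECT, run_query() commits and returns the affected row count.
inserted = run_query(sql, rows)
print(f"{inserted} rows inserted")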
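
Finally, a request sketch for manually exercising the Gemini-backed review check behind the /ocr blueprint. Only the /ocr prefix and the imageUrls/awardImageUrl/title request fields are pinned down by the patches above; the exact route suffix, port, and response keys are assumptions:

import requests

payload = {
    "title": "Riverside cleanup volunteer review",
    "imageUrls": ["https://example-bucket.s3.amazonaws.com/review-photo-1.jpg"],
    "awardImageUrl": None,  # optional; a single URL is wrapped in a list server-side
}

# "/ocr/evaluate" and port 5000 are assumptions; adjust to the deployed route.
resp = requests.post("http://localhost:5000/ocr/evaluate", json=payload, timeout=60)
print(resp.status_code, resp.json())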