diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..e29498f --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,24 @@ +name: Deploy with GitHub Actions + +on: + push: + branches: + - main + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set execute permissions for deploy script + run: chmod +x ${{ github.workspace }}/deploy_script.sh + + + - name: Setup SSH Key + run: | + echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem + chmod 600 id_rsa.pem + + ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh" \ No newline at end of file diff --git a/api_request/reliefweb.py b/api_request/reliefweb.py deleted file mode 100644 index 1bcbd7f..0000000 --- a/api_request/reliefweb.py +++ /dev/null @@ -1,56 +0,0 @@ -import requests -import json - -# API 엔드포인트 -api_url = "https://api.reliefweb.int/v1/jobs?limit=10&offset=1120" -# API링크를 저장하기 위한 배열 -description_endpoint=[] -# API요청을 보내 -while api_url: # 다음으로 참고할 데이터가 없을 경우 조건문 종료 - response = requests.get(api_url) - if response.status_code==200: - data=response.json() - - links = data.get('links', {}) - if links: - next_link = links.get('next', None) - api_url = next_link.get('href', None) if next_link else None - else: - api_url = None # 'next'가 없으면 종료 조건으로 설정 - - - print(api_url) # 디버깅용 - jobs=data.get("data", []) - description_endpoint.append([job['href'] for job in jobs]) - - # for job in jobs: - # href = job.get("href", "No Link") - - # description_endpoint.append({"href": href}) - - - else: - print("API 요청 실패:", response.status_code, response.text) - break - -# print(description_endpoint) - -job_list=[] -flattened_data = [item for sublist in description_endpoint for item in sublist] - -# print(flattened_data) - -for info in flattened_data: - - response=requests.get(info) - - if response.status_code==200: - data=response.json() - jobs=data.get("data", []) - for job in jobs: - fields=job.get("fields", {}) - title=fields.get("title", "No title") - body=fields.get("body", "No body") - job_list.append({"title": title, "body": body}) - -print(json.dumps(job_list, indent=4, ensure_ascii=False)) \ No newline at end of file diff --git a/app.py b/app.py index b7ca66e..8ec10be 100644 --- a/app.py +++ b/app.py @@ -3,9 +3,10 @@ from flask import Flask from flask_cors import CORS from dotenv import load_dotenv +from flasgger import Swagger from server.logger import logger - +#test # 현재 app.py 파일의 디렉토리 경로를 sys.path에 추가 current_dir = os.path.dirname(os.path.abspath(__file__)) if current_dir not in sys.path: @@ -23,6 +24,8 @@ app = Flask(__name__) CORS(app, resources={r"/*": {"origins": "*"}}) +swagger=Swagger(app) + # 모든 Blueprint 등록 from chat import chat_bp app.register_blueprint(chat_bp) diff --git a/crawler/bbc_crawler.py b/crawler/bbc_crawler.py new file mode 100644 index 0000000..f254e29 --- /dev/null +++ b/crawler/bbc_crawler.py @@ -0,0 +1,128 @@ +import requests +from crawler.keyword_extractor import extract_keyword +from summarization.sum_translate import translate_en_to_ko +from crawler.save_to_db import save_issues +from bs4 import BeautifulSoup +from datetime import datetime +from server.db import run_query + +BASE_URL = 'https://web-cdn.api.bbci.co.uk/xd/content-collection/' +COLLECTIONS = { + 'natural-wonders' : '9f0b9075-b620-4859-abdc-ed042dd9ee66', + 
'weather-science' : '696fca43-ec53-418d-a42c-067cb0449ba9', + 'climate-solutions' : '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b', +} +HEADERS = { + 'User-Agent': 'Mozilla/5.0' +} +SIZE = 9 + +def get_last_issue_date(): + sql = """ + SELECT MAX(issue_date) + FROM issues; + """ + result = run_query(sql) + + if result and result[0][0]: + dt = result[0][0] + latest_issue_date = dt.strftime("%Y-%m-%d %H:%M:%S.%f") + return latest_issue_date + else: + return None + +def is_end(date, end_time): + date_dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f") + end_time_dt = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f") + return date_dt <= end_time_dt + +def get_datetime(time): + dt = datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ") + return dt.strftime("%Y-%m-%d %H:%M:%S.%f") + +def get_content(url): + response = requests.get(url) + soup = BeautifulSoup(response.content, "html.parser") + content_divs = soup.find_all('div', attrs={'data-component': 'text-block'}) + contents = [div.get_text(strip=True) for div in content_divs] + full_content = '\n'.join(contents) if contents else "No Content" + + return full_content + +def get_articles(page, collection_id, end_time): + params = { + 'page': page, + 'size': SIZE, + } + + response = requests.get(BASE_URL + collection_id, params=params, headers=HEADERS) + + if not response: + return [] + + datas = response.json().get('data') + articles = [] + + for data in datas: + date = get_datetime(data['firstPublishedAt']) + + if end_time: + if is_end(date, end_time): + break + + title = translate_en_to_ko(data['title']) + keyword = extract_keyword(data['summary']) + summary = translate_en_to_ko(data['summary']) + url = "https://www.bbc.com" + data['path'] + image = data['indexImage']['model']['blocks']['src'] or None + + articles.append( + { + 'content': summary, + 'image_url': image, + 'issue_date': date, + 'keyword': keyword, + 'site_url': url, + 'title': title, + } + ) + print(f"[BBC] 크롤링 완료 : {title}") + + return articles + +def crawl(): + print("[BBC] 크롤링 시작") + results = [] + last_issue_date = get_last_issue_date() + + if last_issue_date: + print(f"[BBC] DB의 마지막 이슈 이후 데이터만 크롤링 시작 (DATE : {last_issue_date})") + else: + print(f"[BBC] DB에 이슈 없음, 모든 데이터 크롤링 시작") + + for category, collection_id in COLLECTIONS.items(): + # print(f"[BBC] 카테고리 {category} :") + page = 0 + + while True: + articles = get_articles(page, collection_id, last_issue_date) + + if not articles: + break + + results.extend(articles) + page += 1 + + if results: + print(f"[BBC] 크롤링 완료 : {len(results)}개의 이슈를 크롤링했습니다.") + save_issues(results) + else: + print("[BBC] 크롤링 완료 : 새로운 이슈가 없습니다.") + + + +def main(): + crawl() + +if __name__ == '__main__': + main() diff --git a/crawler/idealist_crawler.py b/crawler/idealist_crawler.py new file mode 100644 index 0000000..7313333 --- /dev/null +++ b/crawler/idealist_crawler.py @@ -0,0 +1,146 @@ +import requests +import json +from datetime import datetime, timedelta, timezone +from crawler.keyword_extractor import extract_keyword +from crawler.save_to_db import save_activities +from server.db import run_query + +ENDPOINT = "https://nsv3auess7-dsn.algolia.net/1/indexes/*/queries" +HEADERS = { + "Content-Type": "application/json", + "x-algolia-agent": "Algolia for JavaScript (5.20.0); Search (5.20.0); Browser", + "x-algolia-api-key": "c2730ea10ab82787f2f3cc961e8c1e06", + "x-algolia-application-id": "NSV3AUESS7" +} +DEFAULT_IMAGE_URL = "https://www.idealist.org/assets/417d88fd628db1c1ac861f3ea8db58c1a159d52a/images/icons/action-opps/action-opps-volunteermatch.svg" + 
+def get_last_timestamp():
+    sql = """
+    SELECT start_date
+    FROM activities
+    WHERE activity_site = 'IDEALIST'
+    ORDER BY start_date DESC
+    LIMIT 1;
+    """
+    last_timestamp = run_query(sql)
+
+    if last_timestamp:
+        dt = last_timestamp[0][0].replace(tzinfo=timezone.utc)
+        return int(dt.timestamp())
+    else:
+        return 0
+
+def build_payload(page, type='volunteer', timestamp=0):
+    if type == 'volunteer':
+        filters = f"actionType:'VOLOP' AND published > {timestamp}"
+        index_name = "idealist7-production-action-opps"
+    else:
+        filters = f"type:'INTERNSHIP' AND published > {timestamp}"
+        index_name = "idealist7-production"
+
+    return {
+        "requests": [
+            {
+                "indexName": index_name,
+                "facets": ["*"],
+                "hitsPerPage": 100,
+                "attributesToSnippet": ["description:20"],
+                "attributesToRetrieve": ["*"],
+                "filters": filters,
+                "removeStopWords": True,
+                "ignorePlurals": True,
+                "advancedSyntax": True,
+                "queryLanguages": ["en"],
+                "page": page,
+                "query": "",
+                "getRankingInfo": True,
+                "clickAnalytics": True,
+                "analytics": True
+            }
+        ]
+    }
+
+def get_url(item):
+    url = item.get("url")
+    if isinstance(url, str):
+        return url
+    elif isinstance(url, dict):
+        return "https://www.idealist.org" + next(iter(url.values()), "")
+    return ""
+
+def get_image(item):
+    img = item.get("imageUrl") or DEFAULT_IMAGE_URL
+    return img
+
+def get_published(item):
+    timestamp = item.get("published")
+    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')
+
+def get_activities(page, timestamp, type):
+    payload = build_payload(page, type, timestamp)
+    response = requests.post(ENDPOINT, headers=HEADERS, json=payload)
+
+    try:
+        data = response.json()["results"][0]["hits"]
+    except Exception as e:
+        print(f"[!] JSON 파싱 에러: {e}")
+        return None
+
+    result = []
+
+    if data:
+        for item in data:
+            activity_type = "VOLUNTEER" if type=='volunteer' else 'INTERNSHIP'
+            activity_content = item.get("description")
+            activity_name = item.get("name")
+            activity_image_url = get_image(item)
+            activity_url = get_url(item)
+            start_date = get_published(item)
+            end_date = None
+            keyword = extract_keyword(activity_content)
+
+            result.append(
+                {
+                    "activity_site": "IDEALIST",
+                    "activity_type": activity_type,
+                    "activity_content": activity_content,
+                    "end_date": end_date,
+                    "activity_image_url": activity_image_url,
+                    "keyword": keyword,
+                    "activity_name": activity_name,
+                    "site_url": activity_url,
+                    "start_date": start_date
+                }
+            )
+            print(f"[IDEALIST] 크롤링 완료 : {item.get('name', '')}")
+        return result
+    else:
+        return None
+
+def crawl():
+    print("[IDEALIST] 크롤링 시작")
+    crawled_activities = []
+    last_timestamp = get_last_timestamp()
+
+    if last_timestamp > 0:
+        print(f"[IDEALIST] DB의 마지막 활동 이후 데이터만 크롤링 시작 (TIMESTAMP: {last_timestamp})")
+    else:
+        print(f"[IDEALIST] DB에 활동 없음, 모든 데이터 크롤링 시작")
+
+    for type in ['volunteer', 'internship']:
+        page = 0
+        while True:
+            activities = get_activities(page, last_timestamp, type)
+            if not activities:
+                break
+            crawled_activities.extend(activities)
+            page += 1
+
+    if crawled_activities:
+        print(f"[IDEALIST] 크롤링 완료 : {len(crawled_activities)}개의 활동을 크롤링했습니다.")
+        save_activities(crawled_activities)
+    else:
+        print("[IDEALIST] 크롤링 완료 : 새로운 활동이 없습니다.")
+
+if __name__ == "__main__":
+    crawl()
diff --git a/crawler/keyword_extractor.py b/crawler/keyword_extractor.py
new file mode 100644
index 0000000..8d817a1
--- /dev/null
+++ b/crawler/keyword_extractor.py
@@ -0,0 +1,84 @@
+import requests
+import os
+from dotenv import load_dotenv
+
+# .env 파일에서 환경변수 로드
+load_dotenv()
+
+# 키워드 후보
+KEYWORDS = ['Economy','Environment','PeopleAndSociety','Technology'] +MODEL = 'gemini-2.0-flash-lite' + +def extract_keyword(text: str) -> str: + """ + 봉사활동 내용을 입력받아 적절한 키워드를 반환합니다. + + Parameters: + text (str): 봉사활동 내용 + + Returns: + str: 봉사활동 내용에 맞는 키워드 + """ + + # Gemini API 키 가져오기 + api_key = os.getenv('GEMINI_API_KEY') + if not api_key: + raise ValueError("GEMINI_API_KEY 환경변수가 설정되지 않았습니다.") + + # API 엔드포인트 URL + url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={api_key}" + + # 프롬프트 작성 + prompt = f""" +Read the following volunteer activity description and choose the **most appropriate keyword** from the provided list. + +Only output **one keyword**, exactly as it appears in the list. Do not add any extra words or punctuation. + +Volunteer Description: +{text} + +Keyword List: +{', '.join(KEYWORDS)} +""" + + # API 요청 데이터 준비 + payload = { + "contents": [{ + "parts": [{ + "text": prompt + }] + }] + } + + # API 호출 + headers = {'Content-Type': 'application/json'} + try: + response = requests.post(url, headers=headers, json=payload) + response.raise_for_status() # HTTP 에러 체크 + + # 응답 파싱 + result = response.json() + if 'candidates' in result and len(result['candidates']) > 0: + generated_text = result['candidates'][0]['content']['parts'][0]['text'] + # 응답에서 키워드만 추출 (앞뒤 공백 제거) + keyword = generated_text.strip() + + # 추출된 키워드가 후보 목록에 있는지 확인 + if keyword in KEYWORDS: + return keyword + else: + return KEYWORDS[0] # 기본값으로 첫 번째 키워드 반환 + + except Exception as e: + print(f"API 호출 중 오류 발생: {e}") + return KEYWORDS[0] # 오류 발생 시 기본값으로 첫 번째 키워드 반환 + + return KEYWORDS[0] # 기본값으로 첫 번째 키워드 반환 + +if __name__ == "__main__": + # 테스트용 예시 + text = """ + 'Are you passionate about creating a positive change in society? Our CBS-featured non-profit wants you to join us in making a difference. About Us: Bright Mind is an award-winning non-profit organization recognized for our innovative initiatives such as Wellness Week and Street Care. Our outreach has reached up to 60 million people and has been featured on CBS, Politico, ABC, and Newsweek. We are looking for a passionate and versatile volunteer to join our team. If you have a desire to make a positive impact in the lives of those experiencing homelessness, we would love to hear from you! Position Overview ● Bright Mind is seeking dedicated and compassionate individuals to join our Street Care team as Homelessness Volunteers. ● In this role, you will have the opportunity to make a tangible difference in the lives of those experiencing homelessness. ● You will work closely with our Community Outreach team to provide support, resources, and advocacy for homeless individuals and families. ● We have decades of experience providing aid to homeless and highly at risk people, and our program always places safety first. ● We have a variety of openings, whether you’re interested in going out on the street or looking to help in other ways. Key Responsibilities ● Direct Support: ○ Engage with homeless individuals and families to assess their needs and provide appropriate support. ○ Distribute essential items such as food, clothing, hygiene products, and blankets. ● Resource Connection: ○ Connect individuals with local services, including housing, medical care, job training, and mental health support. ○ Provide information about available resources and help individuals navigate the social services system. 
● Advocacy and Education: ○ Participate in community education programs to inform the public about homelessness issues and how they can help. ○ Work with local businesses and organizations to secure support (notably in-kind, such as food and clothing) and collaborate on our homeless initiatives. ● Event Coordination: ○ Assist in organizing and executing events such as donation drives, community meals, and health fairs. ○ Support the planning and logistics of outreach activities and special programs. ● Data Collection and Reporting: ○ Maintain accurate records of interactions and services provided to homeless individuals. ○ Assist with data collection and reporting to help track the impact of Bright Mind’s homelessness programs. Qualifications ● Skills and Competencies: ○ Strong interpersonal and communication skills. ○ Empathy, patience, and a non-judgmental attitude towards individuals experiencing homelessness. ○ Ability to work independently and as part of a team. ○ Flexibility and adaptability in a dynamic work environment. ○ Basic knowledge of social services and resources available for homeless individuals (preferred but not required). ● Experience: ○ Previous volunteer experience, especially in community outreach or working with vulnerable populations, is preferred but not required. ○ Experience in event coordination, advocacy, or data collection is a plus. ● Education: ○ Relevant coursework or training in social work, psychology, or a related field is welcomed. Benefits ● Opportunity to make a meaningful impact in the community. ● Hands-on experience in community outreach and social services. ● Professional development and training opportunities. ● Flexible volunteer schedules to accommodate your availability. Note: This is an unpaid position. Contact Us Please reach out to us at info@brightmindenrichment.org. To apply for this position, email your resume to hr@brightmindenrichment.org. Learn more about our initiatives at Street Care (https://streetcare.us/) and Bright Mind (https://brightmindenrichment.org/). Bright Mind is a federally-recognized 501(c)(3) wellness education non-profit and recipient of awards and certifications in recognition of our achievements.' 
+ """ + keyword = extract_keyword(text) + print(f"선택된 키워드: {keyword}") \ No newline at end of file diff --git a/crawler/main_crawler.py b/crawler/main_crawler.py new file mode 100644 index 0000000..8fc4910 --- /dev/null +++ b/crawler/main_crawler.py @@ -0,0 +1,23 @@ +from crawler.bbc_crawler import crawl as bbc_crawler +from crawler.wevity_crawler import crawl as wevity_crawler +from crawler.idealist_crawler import crawl as idealist_crawler +from crawler.unv_crawler import crawl as unv_crawler +from crawler.v1365_crawler import crawl as v1365_crawler + +if __name__ == "__main__": + # BBC News + # bbc_crawler() + + # WEVITY + wevity_crawler() + + # 1365 + v1365_crawler() + + # IDEALIST + # idealist_crawler() + + # UNVOLUNTEERS + # unv_crawler() + + \ No newline at end of file diff --git a/crawler/save_to_db.py b/crawler/save_to_db.py new file mode 100644 index 0000000..94ea4cc --- /dev/null +++ b/crawler/save_to_db.py @@ -0,0 +1,84 @@ +from server.db import run_query +import uuid + +def save_issues(issues): + if not issues: + print("[DB] 저장할 이슈가 없습니다.") + return + + print("[DB] 크롤링한 이슈 DB 저장 중...") + + sql = """ + INSERT IGNORE INTO issues ( + issue_id, + created_at, + content, + image_url, + issue_date, + keyword, + site_url, + title + ) VALUES (%s, UTC_TIMESTAMP(6), %s, %s, %s, %s, %s, %s) + """ + + values = [ + ( + uuid.uuid4().bytes, + issue['content'], + issue['image_url'], + issue['issue_date'], + issue['keyword'], + issue['site_url'], + issue['title'] + ) + for issue in issues + ] + + if values: + saved_rows = run_query(sql, values) + print(f"[DB] {saved_rows}개의 이슈가 저장되었습니다.") + +def save_activities(activities): + if not activities: + print("[DB] 저장할 활동이 없습니다.") + return + + print("[DB] 크롤링한 활동 DB 저장 중...") + + sql = """ + INSERT IGNORE INTO activities ( + created_at, + end_date, + start_date, + activity_id, + activity_image_url, + activity_name, + site_url, + activity_content, + activity_site, + activity_type, + keyword + ) VALUES (UTC_TIMESTAMP(6), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + + values = [ + ( + activity['end_date'], + activity['start_date'], + uuid.uuid4().bytes, + activity['activity_image_url'], + activity['activity_name'], + activity['site_url'], + activity['activity_content'], + activity['activity_site'], + activity['activity_type'], + activity['keyword'] + ) + for activity in activities + ] + + if values: + saved_rows = run_query(sql, values) + print(f"[DB] {saved_rows}개의 활동이 저장되었습니다.") + + diff --git a/crawler/unv_crawler.py b/crawler/unv_crawler.py new file mode 100644 index 0000000..beea769 --- /dev/null +++ b/crawler/unv_crawler.py @@ -0,0 +1,112 @@ +import requests +from datetime import datetime, timezone +from crawler.keyword_extractor import extract_keyword +from crawler.save_to_db import save_activities +from server.db import run_query + +PAGE_ENDPOINT = "https://app.unv.org/api/doa/doa/SearchDoaAsyncByAzureCognitive" +DETAIL_ENDPOINT = "https://app.unv.org/api/doa/doa/" +URL_BASE = "https://app.unv.org/opportunities/" +HEADERS = { + "User-Agent": "Mozilla/5.0" +} +DEFAULT_IMAGE_URL = "https://www.unv.org/sites/default/files/unvol.png" + +def get_latest_activity_id(): + query = """ + SELECT CAST(SUBSTRING_INDEX(site_url, '/', -1) AS UNSIGNED) as activity_id + FROM activities + WHERE activity_site = "UNVOLUNTEERS" + ORDER BY activity_id DESC + LIMIT 1 + """ + result = run_query(query) + + return int(result[0][0]) if result else 0 + +def get_total_count(): + payload = { + "take": 1, + "skip": 0 + } + response = requests.post(PAGE_ENDPOINT, 
headers=HEADERS, json=payload) + data = response.json() + total_count = data["value"]["total"] + + return total_count + +def iso_to_utc(date_str): + if not date_str: + return None + + return datetime.fromisoformat(date_str) + +def fetch_activity_id_list(): + latest_activity_id = get_latest_activity_id() + total_count = get_total_count() + + # API 요청 + response = requests.post( + PAGE_ENDPOINT, + headers=HEADERS, + json={"skip": 0, "take": total_count} + ) + activities = response.json()["value"]["result"] + + # 마지막 활동이 있으면 그 이후의 데이터만, 없으면 전체 데이터를 가져옴 + if latest_activity_id > 0: + print(f"[UNV] DB의 마지막 활동 이후 데이터만 크롤링 시작 (ID : {latest_activity_id})") + return [activity["id"] for activity in activities if activity["id"] > latest_activity_id] + else: + print(f"[UNV] DB에 활동 없음, 모든 데이터 크롤링 시작") + return [activity["id"] for activity in activities] + +def fetch_activity_detail(activity_id_list): + activities = [] + + for activity_id in activity_id_list: + response = requests.get(DETAIL_ENDPOINT + str(activity_id), headers=HEADERS) + data = response.json()['value'] + + activity_content = ( + f"[Mission and objectives] : {data.get('organizationMission', '')}" + f"[Context] : {data.get('context', '')}" + f"[Task description] : {data.get('taskDescription', '')}" + f"[Required experience]: {data.get('requiredSkillExperience', '')}" + ) + activity_name = data.get("name") + start_date = iso_to_utc(data.get("publishDate")) + end_date = data.get("sourcingEndDate") + site_url = URL_BASE + str(activity_id) + keyword = extract_keyword(data.get('organizationMission') or activity_name) + + activities.append( + { + "activity_site": "UNVOLUNTEERS", + "activity_type": "VOLUNTEER", + "activity_content": activity_content, + "end_date": end_date, + "site_url": site_url, + "activity_image_url": DEFAULT_IMAGE_URL, + "keyword": keyword, + "activity_name": activity_name, + "start_date": start_date + }) + + print(f"[UNV] 활동 크롤링 완료 : {activity_name}") + + return activities + +def crawl(): + print("[UNV] 크롤링 시작") + activity_id_list = fetch_activity_id_list() + + if activity_id_list: + activities = fetch_activity_detail(activity_id_list) + print(f"[UNV] 크롤링 완료 : {len(activity_id_list)}개의 활동을 크롤링했습니다.") + save_activities(activities) + else: + print("[UNV] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == "__main__": + crawl() \ No newline at end of file diff --git a/crawler/v1365_crawler.py b/crawler/v1365_crawler.py new file mode 100644 index 0000000..780fd56 --- /dev/null +++ b/crawler/v1365_crawler.py @@ -0,0 +1,173 @@ +import re +import httpx +import asyncio +import requests +from bs4 import BeautifulSoup +from server.db import run_query +from crawler.save_to_db import save_activities +from crawler.keyword_extractor import extract_keyword +from itertools import chain + +LIST_ENDPOINT = "https://www.1365.go.kr/vols/1572247904127/partcptn/timeCptn.do" +DETAIL_ENDPOINT = "https://www.1365.go.kr/vols/1572247904127/partcptn/timeCptn.do?type=show&progrmRegistNo=" +HEADERS = { + "User-Agent": "Mozilla/5.0" +} +DEFAULT_IMAGE_URL = "https://play-lh.googleusercontent.com/9Kheg_iekobkZlP9XzKtwv_j_YL88oVzHCtHe4_hIL3JcQabCL3FFEw4vKzL1XQc8GE" +BATCH_SIZE = 5 # 한번에 BATCH_SIZE개의 HTTP 요청을 보냄 +MAX_CRAWL_PAGE = 10 # 크롤링할 페이지 수 + +async def get_soup(url, params=None): + """URL에 GET 요청을 보내고 BeautifulSoup 객체를 반환""" + async with httpx.AsyncClient() as client: + response = await client.get(url, params=params, headers=HEADERS) + return BeautifulSoup(response.text, "html.parser") + +def get_exist_ids(): + """DB에서 이미 존재하는 모든 활동 ID들을 리스트로 반환""" + sql = """ 
+ SELECT CAST( + SUBSTRING_INDEX(site_url, 'progrmRegistNo=', -1) AS UNSIGNED) AS id + FROM activities + WHERE activity_site = "KRVOLUNTEERS" + ORDER BY id DESC + """ + result = run_query(sql) + return [int(row[0]) for row in result] if result else [] + +def get_last_page(): + """1365사이트의 마지막 페이지 번호를 반환""" + params = { + "requstSe": "N", + "adultPosblAt": "Y", + "yngbgsPosblAt": "Y", + } + response = requests.get(LIST_ENDPOINT, params=params, headers=HEADERS) + soup = BeautifulSoup(response.text, "html.parser") + + btn_last = soup.find('a', class_='btn_last') + last_page = btn_last.get('href').split('=')[-1] + + return int(last_page) + +def extract_name(soup): + """상세 페이지에서 활동 이름을 추출""" + name = soup.select_one('h3.tit_board_view input').get('value') + return name if name else None + +def extract_dates(soup): + """상세 페이지에서 봉사기간 시작일, 종료일을 추출""" + period = soup.find('dt', string='봉사기간') + if period: + period = period.find_next('dd').text + start_date, end_date = period.split(' ~ ') + return start_date.replace('.', '-'), end_date.replace('.', '-') + return None, None + +def extract_content(soup): + """상세 페이지에서 활동 내용을 추출""" + pre_tag = soup.find('pre') + if pre_tag: + return re.sub(r'[\r\n]+', ' ', pre_tag.get_text(separator="\n", strip=True)) + return "" + +async def extract_ids(page): + """해당 페이지의 활동 ID들을 리스트 형태로 반환""" + params = { + "cPage": page, + "requstSe": "N", + "adultPosblAt": "Y", + "yngbgsPosblAt": "Y", + } + soup = await get_soup(LIST_ENDPOINT, params=params) + + id_list = [] + ul = soup.select_one("ul.list_wrap.wrap2") + if ul: + a_tags = ul.find_all("a", href=True) + for a in a_tags: + href = a['href'] + match = re.search(r'show\((\d+)\)', href) + if match: + id = int(match.group(1)) + id_list.append(id) + + return id_list + +async def fetch_detail(id): + """해당 ID에 해당하는 활동의 상세정보를 추출""" + url = f"{DETAIL_ENDPOINT}{id}" + soup = await get_soup(url) + if not soup: + return None + + start_date, end_date = extract_dates(soup) + activity_content = extract_content(soup) + keyword = extract_keyword(activity_content) + activity_name = extract_name(soup) + + return { + "activity_site": "KRVOLUNTEERS", + "activity_type": "VOLUNTEER", + "activity_content": activity_content, + "end_date": end_date, + "activity_image_url": DEFAULT_IMAGE_URL, + "keyword": keyword, + "activity_name": activity_name, + "site_url": url, + "start_date": start_date + } + +async def crawl_async(): + """비동기적으로 1365 자원봉사 사이트에서 활동 정보를 수집""" + last_page = get_last_page() + start_page = max(last_page - MAX_CRAWL_PAGE, 1) # 시작할 페이지 계산 + exist_ids = get_exist_ids() + id_list = [] + activities = [] + print(f"[1365] 최근 {MAX_CRAWL_PAGE} 개의 페이지 ({start_page} ~ {last_page}) 에서 ID 수집중... 
") + + # ID 수집 (start_page부터 last_page까지 BATCH_SIZE씩 증가) + for start in range(start_page, last_page + 1, BATCH_SIZE): + tasks = [] + end = min(start + BATCH_SIZE, last_page + 1) + + for current_page in range(start, end): + tasks.append(extract_ids(current_page)) + + result = await asyncio.gather(*tasks) + id_list.extend(chain.from_iterable(result)) + + # DB와 비교하여 새로운 ID만 남김 + filtered_id_list = list(set(id_list) - set(exist_ids)) + if not filtered_id_list: + return [] + print(f"[1365] {len(filtered_id_list)} 개의 새로운 활동 ID 수집 완료") + + # DB에 없는 새로운 활동의 상세정보 수집 (BATCH_SIZE 단위로) + for i in range(0, len(filtered_id_list), BATCH_SIZE): + batch = filtered_id_list[i:i + BATCH_SIZE] + detail_tasks = [fetch_detail(id) for id in batch] + try: + print(f"[1365] {len(filtered_id_list)} 개의 활동 중 {i+1} ~ {i+BATCH_SIZE} 의 상세정보 수집 중...") + results = await asyncio.gather(*detail_tasks) + # None이 아닌 결과만 추가 + activities.extend([r for r in results if r is not None]) + except Exception as e: + print(f"Error processing batch {i}: {e}") + continue + + return activities + +def crawl(): + """외부 호출용 크롤링 함수""" + print("[1365] 크롤링 시작") + activities = asyncio.run(crawl_async()) + if activities: + print(f"[1365] 크롤링 완료 : {len(activities)}개의 활동을 크롤링했습니다.") + save_activities(activities) + else: + print("[1365] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == '__main__': + crawl() \ No newline at end of file diff --git a/crawler/wevity_crawler.py b/crawler/wevity_crawler.py new file mode 100644 index 0000000..c682a4d --- /dev/null +++ b/crawler/wevity_crawler.py @@ -0,0 +1,190 @@ +import requests +import time +from bs4 import BeautifulSoup +import re +from datetime import datetime +from urllib.parse import urlparse, parse_qs +from server.db import run_query +from crawler.save_to_db import save_activities +from crawler.keyword_extractor import extract_keyword + +BASE_URL = "https://www.wevity.com" +FILE_NAME = "data/wevity_data.json" +HEADERS = { + "User-Agent": "Mozilla/5.0" +} +MAX_CRAWL_PAGE = 10 + +def get_soup(url): + """웹 페이지를 요청하고 BeautifulSoup 객체 반환""" + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + +def is_special_activity(a_tag): + """SPECIAL 게시물인지 확인""" + return bool(a_tag.select_one("span.stat.spec")) + +def get_latest_activity_id(): + """DB에서 가장 마지막 활동을 조회""" + sql = """ + SELECT CAST( + SUBSTRING_INDEX( + SUBSTRING_INDEX(site_url, 'ix=', -1), + '&', 1 + ) AS UNSIGNED + ) as activity_id + FROM activities + WHERE activity_site = "WEVITY" + ORDER BY activity_id DESC + LIMIT 1; + """ + result = run_query(sql) + + if result and result[0][0]: + return int(result[0][0]) + else: + return 0 + +def get_image_url(soup): + """썸네일 이미지 URL 추출""" + img_tag = soup.select_one("div.thumb img") + if not img_tag or not img_tag.has_attr("src"): + return "" + img_src = img_tag["src"] + return BASE_URL + img_src if img_src.startswith("/") else img_src + +def get_date_range(soup): + """접수기간 추출""" + for li in soup.select("li"): + if "접수기간" in li.get_text(): + match = re.search(r'(\d{4}-\d{2}-\d{2})\s*~\s*(\d{4}-\d{2}-\d{2})', li.get_text()) + if match: + try: + start_date = datetime.strptime(match.group(1), "%Y-%m-%d").replace(hour=0, minute=0, second=0, microsecond=0).isoformat() + end_date = datetime.strptime(match.group(2), "%Y-%m-%d").replace(hour=23, minute=59, second=59, microsecond=999999).isoformat() + return start_date, end_date + except ValueError: + pass + return None, None + +def get_activity_type(soup): + """활동 유형(카테고리) 결정""" + li_tag = 
soup.select_one("ul.cd-info-list li") + + if li_tag: + span_tag = li_tag.find("span", class_="tit") + span_tag.decompose() + category_text = li_tag.get_text(strip=True) + + if category_text == "대외활동/서포터즈": + return "SUPPORTERS" + elif category_text == "봉사활동": + return "VOLUNTEER" + else: + return "CONTEST" + +def get_activity_urls(list_url, last_activity_id): + """활동 목록 페이지에서 새로운 활동 URL들을 수집""" + activity_urls = [] + soup = get_soup(list_url) + activity_items = soup.select("ul.list li") + + for item in activity_items: + # 진행 중인 게시물만 처리 + if item.select_one("span.dday.end"): + continue + + link_tag = item.select_one("a") + if not link_tag: + continue + + activity_url = BASE_URL + link_tag['href'] + # URL에서 활동 ID 추출 + parsed = urlparse(activity_url) + query_params = parse_qs(parsed.query) + current_activity_id = int(query_params.get('ix', ['0'])[0]) + + # ID값이 ID의 마지막 활동 id보다 크면 추가 + if current_activity_id > last_activity_id: + activity_urls.append(activity_url) + # 특별 게시물이 아닌 경우에 ix 값이 작거나 같으면 더 이상 새로운 게시물이 없으므로 종료 + elif not is_special_activity(link_tag): + return activity_urls + + return activity_urls + +def get_activity_detail(url): + """활동 상세 페이지에서 데이터 추출""" + try: + soup = get_soup(url) + + activity_type = get_activity_type(soup) + activity_content = soup.select_one("#viewContents").get_text(strip=True) or None + activity_name = soup.select_one("h6.tit").get_text(strip=True) or None + start_date, end_date = get_date_range(soup) + activity_image_url = get_image_url(soup) + keyword = extract_keyword(activity_content) + + return { + "activity_site": "WEVITY", + "activity_type": activity_type, + "activity_content": activity_content, + "end_date": end_date, + "activity_image_url": activity_image_url, + "keyword": keyword, + "activity_name": activity_name, + "site_url": url, + "start_date": start_date + } + + except Exception as e: + print(f"[ERROR] {url} 에서 오류 발생: {e}") + return None + +def crawl(): + """위비티 활동 크롤링 실행""" + print("[WEVITY] 크롤링 시작") + + last_activity_id = get_latest_activity_id() + + if last_activity_id > 0: + print(f"[WEVITY] DB의 마지막 활동 이후 데이터만 크롤링 시작 (ID : {last_activity_id})") + else: + print(f"[WEVITY] DB에 활동 없음, 모든 데이터 크롤링 시작") + + collected_urls = [] + page = 1 + + print("[WEVITY] 페이지별 활동 링크 수집 중...") + while True and page <= MAX_CRAWL_PAGE: + paged_url = f"{BASE_URL}/?c=find&s=1&gp={str(page)}" + try: + new_urls = get_activity_urls(paged_url, last_activity_id) + if not new_urls: + break + collected_urls.extend(new_urls) + page += 1 + except Exception as e: + print(f"[ERROR] 목록 페이지 {paged_url} 에서 오류 발생: {e}") + break + + crawled_activities = [] + if collected_urls: + print(f"[WEVITY] {len(collected_urls)}개의 활동 링크 수집 완료") + print("[WEVITY] 활동 상세내용 크롤링 중...") + for url in collected_urls: + activity_data = get_activity_detail(url) + time.sleep(1.1) # LLM API 요청 간 시간 간격을 두기 위해 1.1초 대기 + if activity_data: + crawled_activities.append(activity_data) + print(f"[WEVITY] 활동 크롤링 완료 : {activity_data['activity_name']}") + + if crawled_activities: + print(f"[WEVITY] 크롤링 완료 : {len(crawled_activities)}개의 활동을 크롤링했습니다.") + save_activities(crawled_activities) + else: + print("[WEVITY] 크롤링 완료 : 새로운 활동이 없습니다.") + +if __name__ == "__main__": + crawl() diff --git a/custom_keyword/ext.py b/custom_keyword/ext.py index aa88bcd..d413803 100644 --- a/custom_keyword/ext.py +++ b/custom_keyword/ext.py @@ -1,11 +1,9 @@ from transformers import BertTokenizer, BertModel from sklearn.metrics.pairwise import cosine_similarity -import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertModel.from_pretrained('bert-base-uncased') -domain_keywords = ["environment", "Society", "Economic", "technology"] +domain_keywords = ["Economy", "Environment", "Technology", "People", "Society"] def get_embeddings(text: str): inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512) @@ -16,10 +14,22 @@ def calculate_cosine_similarity(vec1, vec2): return cosine_similarity(vec1, vec2) def extract_keywords(question: str): + """ + 텍스트로부터 키워드를 추출하는 함수 + + Args: + question (str): 키워드를 추출하고자 하는 문자열 + Returns: + str: 가장 유사도가 높은 키워드 (DB enum 형식) + """ sentence_embedding = get_embeddings(question) domain_embeddings = [get_embeddings(keyword) for keyword in domain_keywords] similarities = [ (keyword, calculate_cosine_similarity(sentence_embedding, embedding)[0][0]) for keyword, embedding in zip(domain_keywords, domain_embeddings) ] - return sorted(similarities, key=lambda x: x[1], reverse=True) \ No newline at end of file + extracted_keyword = max(similarities, key=lambda x: x[1])[0] + if extracted_keyword.lower() in ["people", "society"]: + return "PeopleAndSociety" + else: + return extracted_keyword diff --git a/deploy_script.sh b/deploy_script.sh new file mode 100644 index 0000000..bc0ae1d --- /dev/null +++ b/deploy_script.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +cd ~/HIGHFIVE-AI/ || exit + +# 패키지 업데이트 및 설치 +pip-compile requirements.in +pip install -r requirements.txt + +# 기존의 스크린 삭제 후 재실행 +screen -S flask-server -X quit +screen -dmS flask-server bash -c "cd ~/HIGHFIVE-AI && python app.py" +echo "Flask is running in a screen session." \ No newline at end of file diff --git a/ocr/__init__.py b/ocr/__init__.py index 6c88272..fb84993 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -1,25 +1,89 @@ -from flask import Blueprint, request, jsonify + +from flask import Blueprint, jsonify, request +from flasgger import Swagger, swag_from + from server.logger import logger -from .o import extract_text, compare_texts +from .o import is_review_valid + ocr_bp = Blueprint('ocr', __name__, url_prefix='/ocr') -@ocr_bp.route('/', methods=['GET']) -def evaluate_image(review_id): - - # 데이터베이스에서 review_id를를 토대로 데이터를 가져옴(미구현) - img_path="" - compare_text = "" +@ocr_bp.route('/', methods=['POST']) +@swag_from({ + 'summary': 'OCR 이미지 비교 API', + 'description': '이미지에서 텍스트를 추출하고 비교하는 API', + 'parameters': [ + { + 'name': 'body', + 'in': 'body', + 'required': True, + 'schema': { + 'type': 'object', + 'properties': { + 'imageUrls': { + 'type': 'array', + 'items': {'type': 'string'}, + 'description': '검토할 이미지 URL 리스트' + }, + 'awardImageUrl': { + 'type': 'string', + 'description': '수상 이미지의 URL' + }, + 'title': { + 'type': 'string', + 'description': '비교할 기준 텍스트' + } + } + } + } + ], + 'responses': { + 200: { + 'description': 'OCR 결과 반환', + 'schema': { + 'type': 'object', + 'properties': { + 'ocrResult': {'type': 'string', 'description': 'OCR 비교 결과'}, + 'awardOcrResult': {'type': 'string', 'description': '수상 이미지 OCR 비교 결과'} + } + } + }, + 500: { + 'description': '서버 에러 발생', + 'schema': { + 'type': 'object', + 'properties': { + 'answer': {'type': 'string', 'description': '에러 메시지'} + } + } + } + } +}) + +def evaluate_image(): + + + data=request.get_json() + + imageUrls=data.get("imageUrls") + awardImageUrl=data.get("awardImageUrl") + title=data.get("title") # OCR 실행 - extracted_text = extract_text(img_path) - - # 비교 실행 - result = compare_texts(extracted_text, compare_text) + if imageUrls: + # ocr결과의 기본값은 False + ocr_result = "False" + ocr_result=is_review_valid(title, imageUrls) + if 
awardImageUrl != None: + awardImageUrlList=[awardImageUrl] + award_ocr_result=is_review_valid(title,awardImageUrlList) + else: + award_ocr_result = "False" + try: - return jsonify({"llm_validation": result, - "review_id": review_id}), 200 + return jsonify({"ocrResult": ocr_result, + "awardOcrResult": award_ocr_result}), 200 except Exception as e: logger.error(e) return jsonify({"answer": f"죄송합니다. 에러가 발생했습니다."}), 500 \ No newline at end of file diff --git a/ocr/o.py b/ocr/o.py index a505897..4dc288f 100644 --- a/ocr/o.py +++ b/ocr/o.py @@ -1,37 +1,82 @@ import torch +import os +import requests +from google import genai +from google.genai import types +from dotenv import load_dotenv from paddleocr import PaddleOCR from openai import OpenAI +# .env파일 로드 +load_dotenv() -client = OpenAI(api_key="") # 나중에 api키 교체 -model = "gpt-4" -ocr = PaddleOCR(lang="korean") +api_key = os.getenv("GEMINI_API_KEY") +MODEL_NAME = "gemini-1.5-flash" -def extract_text(img_path): - """ 이미지에서 텍스트 추출 """ - results = ocr.ocr(img_path, cls=True) - return " ".join(text for result in results for _, (text, _) in result) +def is_review_valid(title: str, image_urls: list[str]) -> bool: + """ + 리뷰 제목과 이미지들을 기반으로 리뷰가 유효한지 판단합니다. + + Args: + title (str): 리뷰 제목 + image_urls (list[str]): 이미지 URL 리스트 (최대 5개 권장) + + Returns: + bool: 리뷰가 유효하면 True, 그렇지 않으면 False + """ + + image_parts = [] + + # 각 이미지 URL을 순회하며 이미지 바이트를 가져와 types.Part 객체로 변환 + for url in image_urls[:5]: # 최대 5개 이미지만 처리 + if url: # URL이 비어있지 않은지 확인 + try: + response = requests.get(url, timeout=5) # 타임아웃 추가 + response.raise_for_status() # HTTP 오류 (4xx, 5xx) 발생 시 예외 발생 + + # MIME 타입 확인 (없으면 기본값 사용) + content_type = response.headers.get('Content-Type', 'image/jpeg') + + # 이미지 바이트를 types.Part 객체로 변환하고 리스트에 추가 + image_part = types.Part.from_bytes(data=response.content, mime_type=content_type) + image_parts.append(image_part) + except requests.exceptions.RequestException as e: + print(f"경고: 이미지를 가져오거나 처리하는 데 실패했습니다. URL: {url}, 오류: {e}") + continue + except Exception as e: + print(f"경고: 이미지 {url} 처리 중 예상치 못한 오류 발생: {e}") + continue -def compare_texts(text1, text2): - """ 두 텍스트 간의 관계 분석 """ - prompt = f""" - Analyze the relationship between the following two texts. Determine whether they are conceptually or contextually related. - If they are related, return True; otherwise, return False without additional explanation + if not image_parts: + print("경고: 유효한 이미지를 찾거나 가져오지 못했습니다. False를 반환합니다.") + return False # 이미지가 없거나 모두 실패하면 유효하지 않다고 판단 - Text 1: - {text1} - Text 2: - {text2} + prompt = f"""리뷰 제목: "{title}" + 리뷰의 제목과, 이미지들에 포함된 텍스트를 하나씩 비교합니다. + 하나라도 맞는 경우 문자열 True를 모두 아닐 경우 False를 반환합니다. """ - response = client.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": "You are an objective analyst. 
Compare the following two texts and determine their relationship strictly based on content."}, - {"role": "user", "content": prompt} - ], - temperature=0, - max_tokens=600 - ) - - return response.choices[0].message.content.strip() \ No newline at end of file + # 텍스트 프롬프트와 모든 이미지 파트를 contents 리스트로 결합 + contents = [prompt] + image_parts + + client = genai.Client(api_key=api_key) + try: + response = client.models.generate_content( + model=MODEL_NAME, + contents=contents, + config={ + # 응답형식을 True, False로 제한 + 'response_mime_type': 'text/x.enum', + 'response_schema': { + "type": "STRING", + "enum": ["True", "False"] + } + } + ) + + print("responsetext: ",response.text) + return response.text.strip() == "True" + + except Exception as e: + print(f"API 호출 실패: {e}") + return False # API 호출 실패 시 유효하지 않다고 판단 \ No newline at end of file diff --git a/requirements.in b/requirements.in index 31d29f0..43a6011 100644 --- a/requirements.in +++ b/requirements.in @@ -5,6 +5,7 @@ flask flask-cors requests python-dotenv +flasgger # Bert 임베딩 기반 유사도 추정 torch @@ -13,10 +14,8 @@ scikit-learn # Generative AI openai +google-genai>=0.1.0 -# OCR -paddlepaddle -paddleocr # MySQL mysql-connector-python diff --git a/requirements.txt b/requirements.txt index e69de29..b02b51a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,198 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements.in +# +annotated-types==0.7.0 + # via pydantic +anyio==4.9.0 + # via + # google-genai + # httpx + # openai +attrs==25.3.0 + # via + # jsonschema + # referencing +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via google-auth +certifi==2025.1.31 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.4.1 + # via requests +click==8.1.8 + # via flask +colorama==0.4.6 + # via + # click + # tqdm +distro==1.9.0 + # via openai +filelock==3.18.0 + # via + # huggingface-hub + # torch + # transformers +flasgger==0.9.7.1 + # via -r requirements.in +flask==3.1.0 + # via + # -r requirements.in + # flasgger + # flask-cors +flask-cors==5.0.1 + # via -r requirements.in +fsspec==2025.3.2 + # via + # huggingface-hub + # torch +google-auth==2.40.2 + # via google-genai +google-genai==1.18.0 + # via -r requirements.in +h11==0.16.0 + # via httpcore +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # google-genai + # openai +huggingface-hub==0.31.1 + # via + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # httpx + # requests +itsdangerous==2.2.0 + # via flask +jinja2==3.1.6 + # via + # flask + # torch +jiter==0.9.0 + # via openai +joblib==1.5.0 + # via scikit-learn +jsonschema==4.23.0 + # via flasgger +jsonschema-specifications==2025.4.1 + # via jsonschema +markupsafe==3.0.2 + # via + # jinja2 + # werkzeug +mistune==3.1.3 + # via flasgger +mpmath==1.3.0 + # via sympy +mysql-connector-python==9.3.0 + # via -r requirements.in +networkx==3.4.2 + # via torch +numpy==2.2.5 + # via + # scikit-learn + # scipy + # transformers +openai==1.78.1 + # via -r requirements.in +packaging==25.0 + # via + # flasgger + # huggingface-hub + # transformers +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.4 + # via + # google-genai + # openai +pydantic-core==2.33.2 + # via pydantic +python-dotenv==1.1.0 + # via -r requirements.in +pyyaml==6.0.2 + # via + # flasgger + # huggingface-hub + # transformers +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +regex==2024.11.6 + # via 
transformers +requests==2.32.3 + # via + # -r requirements.in + # google-genai + # huggingface-hub + # transformers +rpds-py==0.24.0 + # via + # jsonschema + # referencing +rsa==4.9.1 + # via google-auth +safetensors==0.5.3 + # via transformers +scikit-learn==1.6.1 + # via -r requirements.in +scipy==1.15.3 + # via scikit-learn +six==1.17.0 + # via flasgger +sniffio==1.3.1 + # via + # anyio + # openai +sympy==1.14.0 + # via torch +threadpoolctl==3.6.0 + # via scikit-learn +tokenizers==0.21.1 + # via transformers +torch==2.7.0 + # via -r requirements.in +tqdm==4.67.1 + # via + # huggingface-hub + # openai + # transformers +transformers==4.51.3 + # via -r requirements.in +typing-extensions==4.13.2 + # via + # anyio + # google-genai + # huggingface-hub + # openai + # pydantic + # pydantic-core + # referencing + # torch + # typing-inspection +typing-inspection==0.4.0 + # via pydantic +urllib3==2.3.0 + # via requests +websockets==15.0.1 + # via google-genai +werkzeug==3.1.3 + # via + # flask + # flask-cors diff --git a/server/db.py b/server/db.py index 8bb8d5b..eb8ff68 100644 --- a/server/db.py +++ b/server/db.py @@ -16,8 +16,23 @@ def run_query(query: str, params=None): conn = pool.get_connection() cursor = conn.cursor() - cursor.execute(query, params) - results = cursor.fetchall() - cursor.close() - conn.close() # 풀로 반환 - return results + try: + affected_rows = 0 + + # 여러 레코드 삽입인 경우 + if params and isinstance(params, list) and isinstance(params[0], tuple): + cursor.executemany(query, params) + affected_rows = cursor.rowcount + else: + cursor.execute(query, params) + affected_rows = cursor.rowcount + + # SELECT 처리 + if query.strip().lower().startswith("select"): + return cursor.fetchall() + else: + conn.commit() + return affected_rows + finally: + cursor.close() + conn.close() \ No newline at end of file diff --git a/summarization/sum_translate.py b/summarization/sum_translate.py index d9900bc..db9092c 100644 --- a/summarization/sum_translate.py +++ b/summarization/sum_translate.py @@ -1,29 +1,38 @@ from openai import OpenAI -client = OpenAI(api_key="") # 나중에 api_key 교체 +import os +from dotenv import load_dotenv + +# .env파일 로드 +load_dotenv() + +# 환경 변수에서 API 키 가져오기 +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) model = "gpt-4" -def summarize_translate_en_to_ko(text: str) -> str: +def translate_en_to_ko(text: str) -> str: """ - 영어 텍스트를 한국어로 번역하고 요약. - :param text: 번역 및 요약할 영어 문장 - :return: 요약된 한국어 번역 결과 + 영어 텍스트를 한국어로 번역. + + Args: + text (str): 번역하고자 하는 원문(영어) 텍스트 + Returns: + str: 한국어 번역 결과 """ prompt = f""" - Translate and summarize the following English text **into Korean** in **one or two sentences only**. - Focus on capturing the key message, and write naturally in Korean. + Translate the following English text **into Korean**. + Maintain the original tone and context as accurately as possible. - Text: {text} """ response = client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": "You are a professional translator and summarizer."}, + {"role": "system", "content": "You are a professional translator."}, {"role": "user", "content": prompt} ], temperature=0.3, max_tokens=600 ) - return response.choices[0].message.content.strip() \ No newline at end of file + return response.choices[0].message.content.strip()
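Reviewer note: below is a minimal sketch (not part of the patch) of how the new POST /ocr/ endpoint added in ocr/__init__.py can be exercised once app.py is running. The field names (imageUrls, awardImageUrl, title) and response keys (ocrResult, awardOcrResult) come from the swag_from schema and request parsing in the diff; the host/port and the example URLs/strings are assumptions for illustration only.

import requests

# Hypothetical review images and title; replace with real data.
payload = {
    "imageUrls": [
        "https://example.com/review-photo-1.jpg",
        "https://example.com/review-photo-2.jpg",
    ],
    "awardImageUrl": "https://example.com/award-certificate.jpg",  # optional field
    "title": "환경 정화 봉사활동 수료증",  # 비교할 기준 텍스트 (리뷰 제목)
}

# Assumes the Flask app from app.py is running locally on port 5000.
resp = requests.post("http://localhost:5000/ocr/", json=payload, timeout=60)
resp.raise_for_status()

result = resp.json()
print("ocrResult:", result["ocrResult"])            # is_review_valid(title, imageUrls)
print("awardOcrResult:", result["awardOcrResult"])  # award-image check, falsy if no awardImageUrl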