From 9124af9aee711088ba4887fe948e93182b05dfd3 Mon Sep 17 00:00:00 2001 From: Nguyen Viet Quyen Date: Wed, 4 Jun 2025 11:39:46 +0700 Subject: [PATCH 1/3] Refactor image translation: split blocks, return correct format, integrate translation --- backend/api/image_routers.py | 19 + backend/main.py | 615 ++++++++++++------ backend/models/image_translation.py | 1 + backend/requirements.txt | 6 + backend/services/image_translation_service.py | 50 ++ backend/services/punctuation.py | 7 +- backend/services/stt_service.py | 2 +- 7 files changed, 490 insertions(+), 210 deletions(-) create mode 100644 backend/api/image_routers.py create mode 100644 backend/models/image_translation.py create mode 100644 backend/services/image_translation_service.py diff --git a/backend/api/image_routers.py b/backend/api/image_routers.py new file mode 100644 index 0000000..8a0a7dd --- /dev/null +++ b/backend/api/image_routers.py @@ -0,0 +1,19 @@ +from fastapi import APIRouter, UploadFile, File +from backend.services.image_translation_service import ImageTranslationService + +router = APIRouter() +image_service = ImageTranslationService( + api_key="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" +) +IMGBB_API_KEY = "785e57ea3a903daeaa39a49f3cc8bf38" + + +@router.post("/test-ocr") +async def test_ocr(file: UploadFile = File(...)): + try: + image_bytes = await file.read() + image_url = image_service.upload_image_to_imgbb(image_bytes, IMGBB_API_KEY) + sentences = image_service.extract_sentences_with_positions(image_url) + return {"success": True, "sentences": sentences} + except Exception as e: + return {"success": False, "error": str(e)} diff --git a/backend/main.py b/backend/main.py index fec566c..1c97d69 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,3 +1,312 @@ +# import sys +# from pathlib import Path +# from fastapi import FastAPI, WebSocket, WebSocketDisconnect +# from fastapi.middleware.cors import CORSMiddleware +# from fastapi.responses import 
HTMLResponse +# from fastapi.templating import Jinja2Templates +# from fastapi.staticfiles import StaticFiles +# from backend.api import file_routes, feedback_routes +# from starlette.requests import Request +# import json +# import asyncio +# from backend.utils.logger import setup_logger +# from backend.database import engine +# from backend.models.feedback_model import Base +# import base64 + +# # Import hai hàm PUNCTUATION +# from backend.services.punctuation import restore_punctuation, capitalize_after_punctuation + +# # Initialize database +# Base.metadata.create_all(bind=engine) + +# # Thiết lập logger +# logger = setup_logger() + +# # Thêm đường dẫn services vào sys.path để import các service khác (translate, stt, tts) +# sys.path.append(str(Path(__file__).parent / "services")) +# try: +# from translation_service import translate_text +# except ImportError as e: +# print(f"Import error: {e}") +# raise + +# sys.path.append(str(Path(__file__).parent / "services")) +# try: +# from stt_service import speech_to_text +# except ImportError as e: +# print(f"Import error: {e}") +# raise + +# sys.path.append(str(Path(__file__).parent / "services")) +# try: +# from tts_service import text_to_speech +# except ImportError as e: +# print(f"Import error: {e}") +# raise + +# def handle_audio(base64_audio_str: str): +# audio_bytes = base64.b64decode(base64_audio_str) +# result = speech_to_text(audio_bytes) +# return result + +# # === TẠO ỨNG DỤNG FASTAPI === +# app = FastAPI() + +# # Include các router HTTP nếu có +# app.include_router(file_routes.router, prefix="/api", tags=["file-translator"]) +# app.include_router(feedback_routes.router, prefix="/api", tags=["feedback"]) + +# # Cấu hình CORS +# app.add_middleware( +# CORSMiddleware, +# allow_origins=["*"], +# allow_credentials=True, +# allow_methods=["*"], +# allow_headers=["*"], +# ) + +# # Cấu hình static files & templates +# app.mount("/static", StaticFiles(directory="frontend/static"), name="static") +# templates = 
Jinja2Templates(directory="frontend/templates") + +# # === ROUTES GET CHUYÊN BIỆT === +# @app.get("/", response_class=HTMLResponse) +# async def home(request: Request): +# logger.info("Home page accessed") +# return templates.TemplateResponse("home.html", {"request": request}) + +# @app.get("/live-translator", response_class=HTMLResponse) +# async def live_translator(request: Request): +# logger.info("Live translator page accessed") +# return templates.TemplateResponse("live_translator.html", {"request": request}) + +# @app.get("/file-translator", response_class=HTMLResponse) +# async def file_translator(request: Request): +# logger.info("File translator page accessed") +# return templates.TemplateResponse("file_translator.html", {"request": request}) + +# @app.get("/about", response_class=HTMLResponse) +# async def about(request: Request): +# logger.info("About page accessed") +# return templates.TemplateResponse("about.html", {"request": request}) + +# @app.get("/contact", response_class=HTMLResponse) +# async def contact(request: Request): +# logger.info("Contact page accessed") +# return templates.TemplateResponse("contact.html", {"request": request}) + + +# # === WEBSOCKET ENDPOINT === +# @app.websocket("/ws") +# async def websocket_endpoint(websocket: WebSocket): +# await websocket.accept() +# logger.info("Client connected via WebSocket") +# # Ngôn ngữ mặc định (dùng cho STT hoặc TTS) +# lang = "vi" + +# try: +# while True: +# try: +# # Chờ client gửi text (JSON string) trong 30s +# raw = await asyncio.wait_for(websocket.receive_text(), timeout=30.0) +# except asyncio.TimeoutError: +# # Nếu 30s không có message, gửi ping để giữ kết nối +# try: +# await websocket.send_json({"type": "ping"}) +# except WebSocketDisconnect: +# # Nếu client đã ngắt lúc ping, thoát vòng lặp +# break +# continue +# except WebSocketDisconnect: +# # Client đóng kết nối +# logger.info("Client disconnected (during receive)") +# break + +# # Nếu nhận được raw text, parse JSON +# try: +# 
message = json.loads(raw) +# except json.JSONDecodeError: +# # Nếu JSON sai định dạng, gửi text lỗi rồi tiếp tục +# try: +# await websocket.send_text(f"Invalid JSON: {raw}") +# except WebSocketDisconnect: +# pass +# continue + +# msg_type = message.get("type") + +# # ---------------------------- +# # 1) PUNCTUATION (chèn dấu câu + viết hoa) +# # Client phải gửi: { "type": "punctuation", "text": "" } +# # Server trả về: { "type": "punctuated", "text": "" } +# # ---------------------------- +# # if msg_type == "punctuation": +# # original = message.get("text", "").strip() +# # if not original: +# # try: +# # await websocket.send_json({ +# # "type": "error", +# # "message": "Empty text field for punctuation" +# # }) +# # except WebSocketDisconnect: +# # pass +# # continue + +# # # 1.a) Phục hồi dấu câu +# # punctuated = restore_punctuation(original) +# # # 1.b) Viết hoa sau dấu câu +# # normalized = capitalize_after_punctuation(punctuated) + +# # try: +# # await websocket.send_json({ +# # "type": "punctuation", +# # "text": normalized +# # }) +# # except WebSocketDisconnect: +# # # Nếu client đã ngắt, break +# # break + +# # ---------------------------- +# # 2) TRANSLATION (dịch văn bản) +# # Client gửi: { "type": "translate", "text": "...", "source_lang": "...", "target_lang": "..." 
} +# # Server trả: { "type": "translation", "text": "", "origin": "" } +# # ---------------------------- +# if msg_type == "translate": +# text_to_translate = message.get("text", "") +# source_lang = message.get("source_lang", "auto") +# target_lang = message.get("target_lang", "vi") + +# try: +# translated_text = translate_text(text_to_translate, source_lang, target_lang) +# logger.info(f"Translate [{source_lang}→{target_lang}]: {text_to_translate} → {translated_text}") +# await websocket.send_json({ +# "type": "translation", +# "text": str(translated_text), +# "origin": text_to_translate +# }) +# except Exception as e: +# logger.error(f"Translation error: {e}") +# try: +# await websocket.send_json({ +# "type": "error", +# "message": f"Translation failed: {e}" +# }) +# except WebSocketDisconnect: +# pass + +# # ---------------------------- +# # 3) TEXT-TO-SPEECH +# # Client gửi: { "type": "playText", "text": "...", "target_lang": "..." } +# # Server trả: { "type": "audio", "audio": , "text": "" } +# # ---------------------------- +# elif msg_type == "playText": +# text_to_play = message.get("text", "") +# target_lang = message.get("target_lang", "vi") +# try: +# audio_bytes = text_to_speech(text_to_play, target_lang) +# logger.info(f"TTS [{target_lang}]: {text_to_play}") +# await websocket.send_json({ +# "type": "audio", +# "audio": audio_bytes, +# "text": text_to_play +# }) +# except Exception as e: +# logger.error(f"TTS error: {e}") +# try: +# await websocket.send_json({ +# "type": "error", +# "message": f"TTS failed: {e}" +# }) +# except WebSocketDisconnect: +# pass + +# # ---------------------------- +# # 4) SPEECH-TO-TEXT (nếu bạn dùng binary audio frames) +# # Client có thể gửi: { "type": "whisper", "audio_base64": "..." } +# # (Hoặc kết hợp receive_bytes nếu không qua JSON) +# # Server trả: { "type": "STT", "text": "", ... 
} +# # ---------------------------- +# elif msg_type == "whisper": +# base64_audio = message.get("audio", "") +# language = message.get("language","") +# if not base64_audio: +# try: +# await websocket.send_json({ +# "type": "error", +# "message": "No audio_base64 field provided" +# }) +# except WebSocketDisconnect: +# pass +# continue + +# try: +# audio_bytes = base64.b64decode(base64_audio) +# text = speech_to_text(audio_bytes,language) +# logger.info(f"STT result: {text}") +# await websocket.send_json({ +# "type": "Whisper_result", +# "text": str(text) +# }) +# except Exception as e: +# logger.error(f"STT error: {e}") +# try: +# await websocket.send_json({ +# "type": "error", +# "message": f"Speech-to-text failed: {e}" +# }) +# except WebSocketDisconnect: +# pass + +# # ---------------------------- +# # 5) PING/PONG giữ kết nối +# # ---------------------------- +# elif msg_type == "ping": +# try: +# await websocket.send_json({"type": "pong"}) +# except WebSocketDisconnect: +# break + +# # ---------------------------- +# # 6) Nếu type không được định nghĩa, trả về echo +# # ---------------------------- +# else: +# try: +# await websocket.send_json({ +# "type": "echo", +# "text": f"Unknown type: {msg_type}" +# }) +# except WebSocketDisconnect: +# break + +# except WebSocketDisconnect: +# logger.info("Client disconnected (outer WebSocketDisconnect)") +# return + +# except Exception as e: +# # Bắt các lỗi bất ngờ khác +# logger.error(f"Unexpected WebSocket error: {e}") +# try: +# await websocket.send_json({ +# "type": "error", +# "message": str(e) +# }) +# except: +# pass + +# finally: +# # Đóng WebSocket nếu chưa đóng +# try: +# await websocket.close() +# except: +# pass + + +# # Nếu chạy main.py trực tiếp: +# if __name__ == "__main__": +# import uvicorn +# uvicorn.run("backend.main:app", host="0.0.0.0", port=8000, reload=True) + import sys from pathlib import Path from fastapi import FastAPI, WebSocket, WebSocketDisconnect @@ -5,17 +314,15 @@ from 
fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates from fastapi.staticfiles import StaticFiles -from backend.api import file_routes, feedback_routes +from backend.api import file_routes +from backend.api import feedback_routes from starlette.requests import Request import json import asyncio from backend.utils.logger import setup_logger from backend.database import engine from backend.models.feedback_model import Base -import base64 - -# Import hai hàm PUNCTUATION -from backend.services.punctuation import restore_punctuation, capitalize_after_punctuation +from backend.api import image_routers # Initialize database Base.metadata.create_all(bind=engine) @@ -23,7 +330,7 @@ # Thiết lập logger logger = setup_logger() -# Thêm đường dẫn services vào sys.path để import các service khác (translate, stt, tts) +# Thêm thư mục chứa translation_service vào path sys.path.append(str(Path(__file__).parent / "services")) try: from translation_service import translate_text @@ -45,17 +352,11 @@ print(f"Import error: {e}") raise -def handle_audio(base64_audio_str: str): - audio_bytes = base64.b64decode(base64_audio_str) - result = speech_to_text(audio_bytes) - return result - -# === TẠO ỨNG DỤNG FASTAPI === +# Khởi tạo ứng dụng app = FastAPI() - -# Include các router HTTP nếu có app.include_router(file_routes.router, prefix="/api", tags=["file-translator"]) app.include_router(feedback_routes.router, prefix="/api", tags=["feedback"]) +app.include_router(image_routers.router, prefix="/api", tags=["image-translator"]) # Cấu hình CORS app.add_middleware( @@ -66,243 +367,145 @@ def handle_audio(base64_audio_str: str): allow_headers=["*"], ) -# Cấu hình static files & templates +# Cấu hình thư mục tĩnh và templates app.mount("/static", StaticFiles(directory="frontend/static"), name="static") templates = Jinja2Templates(directory="frontend/templates") -# === ROUTES GET CHUYÊN BIỆT === + +# Route chính @app.get("/", response_class=HTMLResponse) async def 
home(request: Request): logger.info("Home page accessed") + # Trả về template home.html return templates.TemplateResponse("home.html", {"request": request}) + @app.get("/live-translator", response_class=HTMLResponse) async def live_translator(request: Request): logger.info("Live translator page accessed") + # Trả về template live_translator.html return templates.TemplateResponse("live_translator.html", {"request": request}) + @app.get("/file-translator", response_class=HTMLResponse) async def file_translator(request: Request): logger.info("File translator page accessed") + # Trả về template file_translator.html return templates.TemplateResponse("file_translator.html", {"request": request}) + @app.get("/about", response_class=HTMLResponse) async def about(request: Request): logger.info("About page accessed") + # Trả về template about.html return templates.TemplateResponse("about.html", {"request": request}) + @app.get("/contact", response_class=HTMLResponse) async def contact(request: Request): logger.info("Contact page accessed") + # Trả về template contact.html return templates.TemplateResponse("contact.html", {"request": request}) -# === WEBSOCKET ENDPOINT === @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): await websocket.accept() - logger.info("Client connected via WebSocket") - # Ngôn ngữ mặc định (dùng cho STT hoặc TTS) + logger.info("Client connected") + # Ngôn ngữ mặc định lang = "vi" - try: while True: try: - # Chờ client gửi text (JSON string) trong 30s - raw = await asyncio.wait_for(websocket.receive_text(), timeout=30.0) - except asyncio.TimeoutError: - # Nếu 30s không có message, gửi ping để giữ kết nối - try: - await websocket.send_json({"type": "ping"}) - except WebSocketDisconnect: - # Nếu client đã ngắt lúc ping, thoát vòng lặp - break - continue - except WebSocketDisconnect: - # Client đóng kết nối - logger.info("Client disconnected (during receive)") - break + # Nhận dữ liệu từ WebSocket với timeout + 
logger.info("Waiting for data from client...") + data = await asyncio.wait_for(websocket.receive(), timeout=30.0) + logger.info(f"Received data: {data}") - # Nếu nhận được raw text, parse JSON - try: - message = json.loads(raw) - except json.JSONDecodeError: - # Nếu JSON sai định dạng, gửi text lỗi rồi tiếp tục - try: - await websocket.send_text(f"Invalid JSON: {raw}") - except WebSocketDisconnect: - pass - continue - - msg_type = message.get("type") - - # ---------------------------- - # 1) PUNCTUATION (chèn dấu câu + viết hoa) - # Client phải gửi: { "type": "punctuation", "text": "" } - # Server trả về: { "type": "punctuated", "text": "" } - # ---------------------------- - if msg_type == "punctuation": - original = message.get("text", "").strip() - if not original: - try: - await websocket.send_json({ - "type": "error", - "message": "Empty text field for punctuation" - }) - except WebSocketDisconnect: - pass - continue - - # 1.a) Phục hồi dấu câu - punctuated = restore_punctuation(original) - # 1.b) Viết hoa sau dấu câu - normalized = capitalize_after_punctuation(punctuated) - - try: - await websocket.send_json({ - "type": "punctuation", - "text": normalized - }) - except WebSocketDisconnect: - # Nếu client đã ngắt, break - break - - # ---------------------------- - # 2) TRANSLATION (dịch văn bản) - # Client gửi: { "type": "translate", "text": "...", "source_lang": "...", "target_lang": "..." 
} - # Server trả: { "type": "translation", "text": "", "origin": "" } - # ---------------------------- - elif msg_type == "translate": - text_to_translate = message.get("text", "") - source_lang = message.get("source_lang", "auto") - target_lang = message.get("target_lang", "vi") - - try: - translated_text = translate_text(text_to_translate, source_lang, target_lang) - logger.info(f"Translate [{source_lang}→{target_lang}]: {text_to_translate} → {translated_text}") - await websocket.send_json({ - "type": "translation", - "text": str(translated_text), - "origin": text_to_translate - }) - except Exception as e: - logger.error(f"Translation error: {e}") + # Kiểm tra loại dữ liệu là text hay audio + if "text" in data: try: - await websocket.send_json({ - "type": "error", - "message": f"Translation failed: {e}" - }) - except WebSocketDisconnect: - pass - - # ---------------------------- - # 3) TEXT-TO-SPEECH - # Client gửi: { "type": "playText", "text": "...", "target_lang": "..." } - # Server trả: { "type": "audio", "audio": , "text": "" } - # ---------------------------- - elif msg_type == "playText": - text_to_play = message.get("text", "") - target_lang = message.get("target_lang", "vi") - try: - audio_bytes = text_to_speech(text_to_play, target_lang) - logger.info(f"TTS [{target_lang}]: {text_to_play}") - await websocket.send_json({ - "type": "audio", - "audio": audio_bytes, - "text": text_to_play - }) - except Exception as e: - logger.error(f"TTS error: {e}") + message = json.loads(data["text"]) + + if message.get("type") == "translate": + text = message.get("text", "") + source_lang = message.get("source_lang", "auto") + target_lang = message.get("target_lang", "vi") + + # Dịch văn bản + translated_text = translate_text( + text, source_lang, target_lang + ) + logger.info( + f"Translate from {source_lang} to {target_lang}: {text} -> {translated_text}" + ) + await websocket.send_json( + { + "type": "translation", + "text": str(translated_text), + "origin": text, + } 
+ ) + elif message.get("type") == "playText": + text = message.get("text", "") + target_lang = message.get("target_lang", "vi") + # Chuyển văn bản thành âm thanh + audio = text_to_speech(text, target_lang) + logger.info(f"Text to Speech with {target_lang}: {text}") + # Mã hóa dữ liệu âm thanh thành base64 + await websocket.send_json( + {"type": "audio", "audio": audio, "text": text} + ) + elif message.get("type") == "audio": + lang = message.get("lang", "vi") + logger.info(f"Language set to: {lang}") + except json.JSONDecodeError: + logger.error("Invalid JSON format") + await websocket.send_text(f"Echo: {data['text']}") + + elif "bytes" in data: try: - await websocket.send_json({ - "type": "error", - "message": f"TTS failed: {e}" - }) - except WebSocketDisconnect: - pass - - # ---------------------------- - # 4) SPEECH-TO-TEXT (nếu bạn dùng binary audio frames) - # Client có thể gửi: { "type": "whisper", "audio_base64": "..." } - # (Hoặc kết hợp receive_bytes nếu không qua JSON) - # Server trả: { "type": "STT", "text": "", ... 
} - # ---------------------------- - elif msg_type == "whisper": - base64_audio = message.get("audio", "") - language = message.get("language","") - if not base64_audio: - try: - await websocket.send_json({ - "type": "error", - "message": "No audio_base64 field provided" - }) - except WebSocketDisconnect: - pass - continue - - try: - audio_bytes = base64.b64decode(base64_audio) - text = speech_to_text(audio_bytes,language) - logger.info(f"STT result: {text}") - await websocket.send_json({ - "type": "Whisper_result", - "text": str(text) - }) - except Exception as e: - logger.error(f"STT error: {e}") - try: - await websocket.send_json({ - "type": "error", - "message": f"Speech-to-text failed: {e}" - }) - except WebSocketDisconnect: - pass - - # ---------------------------- - # 5) PING/PONG giữ kết nối - # ---------------------------- - elif msg_type == "ping": - try: - await websocket.send_json({"type": "pong"}) - except WebSocketDisconnect: - break - - # ---------------------------- - # 6) Nếu type không được định nghĩa, trả về echo - # ---------------------------- - else: - try: - await websocket.send_json({ - "type": "echo", - "text": f"Unknown type: {msg_type}" - }) - except WebSocketDisconnect: - break + audio_data = data["bytes"] + + # Chuyển đổi âm thanh thành văn bản + text = speech_to_text(audio_data, language=lang) + logger.info( + f"Speech to Text with {lang}: {audio_data} -> {text}" + ) + # Gửi kết quả văn bản về phía client + await websocket.send_json( + { + "type": "STT", + "text": str(text), + } + ) + except Exception as e: + logger.error(f"STT error: {e}") + await websocket.send_json( + { + "type": "error", + "message": f"Speech-to-text failed: {str(e)}", + } + ) + except asyncio.TimeoutError: + logger.info("Timeout, sending ping") + await websocket.send_json({"type": "ping"}) + except WebSocketDisconnect: + logger.info("Client disconnected") + break except WebSocketDisconnect: - logger.info("Client disconnected (outer WebSocketDisconnect)") + 
logger.info("Client disconnected") return - except Exception as e: - # Bắt các lỗi bất ngờ khác - logger.error(f"Unexpected WebSocket error: {e}") - try: - await websocket.send_json({ - "type": "error", - "message": str(e) - }) - except: - pass - + logger.error(f"Error: {e}") + await websocket.send_json({"type": "error", "message": str(e)}) finally: - # Đóng WebSocket nếu chưa đóng - try: - await websocket.close() - except: - pass + logger.info("Closing WebSocket connection") + await websocket.close() -# Nếu chạy main.py trực tiếp: if __name__ == "__main__": import uvicorn - uvicorn.run("backend.main:app", host="0.0.0.0", port=8000, reload=True) + + uvicorn.run(app, host="0.0.0.0", port=8000, reload=True) diff --git a/backend/models/image_translation.py b/backend/models/image_translation.py new file mode 100644 index 0000000..eabeeb5 --- /dev/null +++ b/backend/models/image_translation.py @@ -0,0 +1 @@ +# Model OCR diff --git a/backend/requirements.txt b/backend/requirements.txt index 61f532d..6048ff8 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -18,3 +18,9 @@ pymupdf pytesseract pillow sqlalchemy +pytesseract +pillow +opencv-python +transformers +torch + diff --git a/backend/services/image_translation_service.py b/backend/services/image_translation_service.py new file mode 100644 index 0000000..ac41bb3 --- /dev/null +++ b/backend/services/image_translation_service.py @@ -0,0 +1,50 @@ +import requests +import json +import base64 + + +class ImageTranslationService: + def __init__( + self, api_key: str, model: str = "google/gemini-2.5-flash-preview-05-20" + ): + self.api_key = api_key + self.model = model + self.url = "https://openrouter.ai/api/v1/chat/completions" + + def upload_image_to_imgbb(self, image_bytes: bytes, api_key: str) -> str: + url = "https://api.imgbb.com/1/upload" + b64_image = base64.b64encode(image_bytes).decode("utf-8") + payload = {"key": api_key, "image": b64_image} + response = requests.post(url, data=payload) + 
response.raise_for_status() + return response.json()["data"]["url"] + + def extract_sentences_with_positions(self, image_url: str) -> dict: + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + "HTTP-Referer": "https://www.google.com", + "X-Title": "Google", + } + prompt = ( + "Analyze this image, split it into sentences or paragraphs. " + "Return a JSON object with key 'text' as a list, each element is a dict: " + "{'text': , 'x': , 'y': }" + ) + data = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + } + ], + } + response = requests.post(self.url, headers=headers, data=json.dumps(data)) + response.raise_for_status() + content = response.json()["choices"][0]["message"]["content"] + result = json.loads(content) + return result diff --git a/backend/services/punctuation.py b/backend/services/punctuation.py index 4f8e879..01d279b 100644 --- a/backend/services/punctuation.py +++ b/backend/services/punctuation.py @@ -18,9 +18,10 @@ model=punct_model, tokenizer=punct_tokenizer, aggregation_strategy="simple", - device=DEVICE, # <-- nếu có GPU, pipeline sẽ chạy trên GPU + device=DEVICE, # <-- nếu có GPU, pipeline sẽ chạy trên GPU ) + # 2. Hàm viết hoa sau dấu câu (unchanged) def capitalize_after_punctuation(text: str) -> str: text = text.strip() @@ -34,10 +35,11 @@ def capitalize_after_punctuation(text: str) -> str: def repl(match): return match.group(1) + match.group(2).upper() - pattern = r'([.?!]\s+)(\w)' + pattern = r"([.?!]\s+)(\w)" text = re.sub(pattern, repl, text) return text + # 3. 
Hàm phục hồi dấu câu def restore_punctuation(text: str) -> str: """ @@ -63,4 +65,3 @@ def restore_punctuation(text: str) -> str: # Ghép lại và xóa khoảng trắng dư return " ".join(result).strip() - diff --git a/backend/services/stt_service.py b/backend/services/stt_service.py index 8a12bf1..922a2de 100644 --- a/backend/services/stt_service.py +++ b/backend/services/stt_service.py @@ -2,7 +2,7 @@ import tempfile import subprocess import os -from punctuation import restore_punctuation +#from punctuation import restore_punctuation # Load mô hình Whisper (nhẹ và đủ tốt) From 5c932add05226eef750a002d7301b16b3d7d71cd Mon Sep 17 00:00:00 2001 From: Nguyen Viet Quyen Date: Wed, 4 Jun 2025 17:17:18 +0700 Subject: [PATCH 2/3] Refactor image translation: split blocks, return correct format, integrate translation --- .cursor/rules/translation-rule.mdc | 40 +++++ .gitignore | 6 + backend/api/image_routers.py | 72 +++++++- backend/services/draw_service.py | 39 ++++ backend/services/image_translation_service.py | 166 +++++++++++++----- requirements.txt | 13 +- test.py | 34 ++++ 7 files changed, 317 insertions(+), 53 deletions(-) create mode 100644 .cursor/rules/translation-rule.mdc create mode 100644 backend/services/draw_service.py create mode 100644 test.py diff --git a/.cursor/rules/translation-rule.mdc b/.cursor/rules/translation-rule.mdc new file mode 100644 index 0000000..663f38b --- /dev/null +++ b/.cursor/rules/translation-rule.mdc @@ -0,0 +1,40 @@ +--- +description: +globs: +alwaysApply: true +--- +Project: RealTime-Translator + +Technology stack: +- Backend: FastAPI (Python 3.11+) +- Real-time communication: WebSocket +- Audio processing: Speech-to-Text, Text-to-Speech +- Translation: MarianMT, OpenAI APIs, Google Cloud Speech, Whisper, gTTS +- Frontend: HTML/CSS/JavaScript (real-time interaction via WebSocket) + +Guidelines: + +1. Use FastAPI async endpoints and WebSocket routes for low latency. +2. 
Organize code into modular services: audio_processing, translation, websocket_handling, api_routes. +3. Use Pydantic models for request/response validation with type hints. +4. Integrate external AI/ML models and APIs with clear interface abstractions and fallback strategies. +5. Ensure thread-safe, async-friendly design especially for audio stream processing. +6. Follow OpenAPI standards; auto-generate docs from FastAPI annotations. +7. Use Python logging with appropriate levels (INFO, WARNING, ERROR) and structured logs for troubleshooting. +8. Implement security measures: input sanitization, WebSocket authentication, rate limiting. +9. Write unit and integration tests using pytest and httpx (or websockets testing tools). +10. Use environment variables or config files to manage API keys and sensitive data. +11. Optimize audio data handling to minimize latency and memory usage. +12. Frontend communication via WebSocket should support reconnect and error handling. +13. Document complex workflows, especially audio streaming and translation pipelines. +14. Code style: PEP 8 compliance, 4-space indentation, meaningful English names. +15. For every API or WebSocket handler, include error handling and clear response schemas. +16. Avoid blocking calls in async functions; use asyncio or thread pools as needed. +17. When suggesting code, include necessary import statements and type annotations. +18. All answers must be equal to ** Vietnamese **, unless otherwise required. +19. Do not arbitrarily edit or change my file in cursor if I ** has not allowed **. Should only give a specific code or suggestion with the explanation. +20. When there are many deployment options, please ** compare and explain advantages/disadvantages ** briefly and clearly. +21. Priority is a simple, effective, easy -to -expand and long -term solution. 
+ + + diff --git a/.gitignore b/.gitignore index f1e0fb0..ff9dc1d 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,9 @@ Thumbs.db ehthumbs.db *.swp *.swo + + +.cursor +.qodo/ + + diff --git a/backend/api/image_routers.py b/backend/api/image_routers.py index 8a0a7dd..f27bcaf 100644 --- a/backend/api/image_routers.py +++ b/backend/api/image_routers.py @@ -1,19 +1,73 @@ -from fastapi import APIRouter, UploadFile, File +from fastapi import APIRouter, UploadFile, File, HTTPException from backend.services.image_translation_service import ImageTranslationService +from typing import Dict, Any +from backend.services.translation_service import translate_text +from backend.services.draw_service import DrawService router = APIRouter() + +# Initialize service with OpenRouter API key image_service = ImageTranslationService( - api_key="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" + api_key="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" ) -IMGBB_API_KEY = "785e57ea3a903daeaa39a49f3cc8bf38" +# Cloudinary API key +CLOUDINARY_API_KEY = "g9AIU2wXL2JtYavmHKukq0SgNTg" -@router.post("/test-ocr") -async def test_ocr(file: UploadFile = File(...)): +@router.post("/extract-text") +async def extract_text_from_image(file: UploadFile = File(...)) -> Dict[str, Any]: + """ + Extract text and positions from uploaded image + Args: + file: Uploaded image file + Returns: + Dict containing extracted text regions with positions + """ try: + # Validate file type + if not file.content_type.startswith('image/'): + raise HTTPException( + status_code=400, + detail="Invalid file type. Only images are allowed." 
+ ) + + # Read image bytes image_bytes = await file.read() - image_url = image_service.upload_image_to_imgbb(image_bytes, IMGBB_API_KEY) - sentences = image_service.extract_sentences_with_positions(image_url) - return {"success": True, "sentences": sentences} + + # Upload to Cloudinary and get URL + image_url = image_service.upload_image_to_cloudinary( + image_bytes=image_bytes, + api_key=CLOUDINARY_API_KEY + ) + + # Extract text with positions + text_regions = image_service.extract_sentences_with_positions(image_url) + + # Translate text + for text_region in text_regions: + text_region['text'] = translate_text(text_region['text']) + + draw_service = DrawService(font_path="arial.ttf", font_size=16) + image_with_text = draw_service.draw_text_on_image(image_url, text_regions) + new_image_url = image_service.upload_image_to_cloudinary( + image_bytes=image_with_text, + api_key=CLOUDINARY_API_KEY + ) + + return { + "success": True, + "image_url": new_image_url, + "text_regions": text_regions, + "new_image_url": new_image_url + } + except Exception as e: - return {"success": False, "error": str(e)} + # Log the error (you might want to use proper logging) + print(f"Error processing image: {str(e)}") + + # Return appropriate error response + raise HTTPException( + status_code=500, + detail=f"Failed to process image: {str(e)}" + ) + diff --git a/backend/services/draw_service.py b/backend/services/draw_service.py new file mode 100644 index 0000000..3510806 --- /dev/null +++ b/backend/services/draw_service.py @@ -0,0 +1,39 @@ +import cv2 +import numpy as np +from PIL import Image, ImageDraw, ImageFont +import requests +from typing import List, Dict + + +class DrawService: + def __init__(self, font_path: str = "arial.ttf", font_size: int = 16): + self.font_path = font_path + self.font_size = font_size + + def draw_text_on_image(self, image_url: str, text_regions: List[Dict]) -> bytes: + response = requests.get(image_url) + img_arr = np.asarray(bytearray(response.content), 
class DrawService:
    """Render (translated) text onto rectangular regions of a remote image.

    The image is downloaded, each region is covered with a translucent
    white box, the region's text is drawn centred inside it, and the
    result is returned as JPEG bytes.
    """

    def __init__(self, font_path: str = "arial.ttf", font_size: int = 16):
        # Path to a TrueType font file and the size used for every region.
        self.font_path = font_path
        self.font_size = font_size

    def draw_text_on_image(self, image_url: str, text_regions: List[Dict]) -> bytes:
        """Download *image_url* and overlay each region's text onto it.

        Args:
            image_url: Public URL of the source image.
            text_regions: Dicts with ``x``, ``y``, ``width``, ``height`` and
                ``text`` keys; pixel coordinates, origin at the top-left.

        Returns:
            JPEG-encoded bytes of the annotated image.

        Raises:
            requests.HTTPError: If the image download fails.
        """
        # Timeout + status check: previously a hung or failed download was
        # silently decoded into garbage.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        img_arr = np.asarray(bytearray(response.content), dtype=np.uint8)
        image = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)

        # PIL is used for text rendering (RGBA fills); OpenCV for codecs.
        image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        draw = ImageDraw.Draw(image_pil, "RGBA")
        try:
            font = ImageFont.truetype(self.font_path, self.font_size)
        except OSError:
            # Fall back to PIL's built-in font when the TTF is missing,
            # rather than failing the whole request.
            font = ImageFont.load_default()

        for region in text_regions:
            x, y = region["x"], region["y"]
            w, h = region["width"], region["height"]
            translated = region.get("text", "")

            # Translucent white box hides the original text underneath.
            draw.rectangle([x, y, x + w, y + h], fill=(255, 255, 255, 180))

            # textbbox() may return a non-zero origin (bbox[0]/bbox[1],
            # e.g. font ascent); subtract it so the glyphs are truly
            # centred in the region instead of drifting down-right.
            bbox = draw.textbbox((0, 0), translated, font=font)
            w_text = bbox[2] - bbox[0]
            h_text = bbox[3] - bbox[1]
            x_text = x + (w - w_text) // 2 - bbox[0]
            y_text = y + (h - h_text) // 2 - bbox[1]
            draw.text((x_text, y_text), translated, font=font, fill=(255, 0, 0, 255))

        image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
        _, img_encoded = cv2.imencode(".jpg", image_cv)
        return img_encoded.tobytes()
class ImageTranslationService:
    """OCR + layout extraction via OpenRouter vision models, with image
    hosting on Cloudinary."""

    def __init__(self, api_key: str):
        """
        Args:
            api_key: OpenRouter API key used for the chat-completions calls.
        """
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": os.getenv("SITE_URL", "http://localhost:8000"),
            "X-Title": os.getenv("SITE_NAME", "LiveTranslator"),
        }

    def encode_image(self, image_path: str) -> str:
        """Return the file at *image_path* as a base64-encoded ASCII string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def upload_image_to_cloudinary(self, image_bytes: bytes, api_key: str) -> str:
        """
        Upload *image_bytes* to Cloudinary and return the public URL.

        Args:
            image_bytes: Raw image data.
            api_key: Cloudinary API secret.  Previously this parameter was
                silently ignored in favour of a hardcoded secret; it is now
                used as the fallback when CLOUDINARY_API_SECRET is not set
                (callers already pass the same value, so behaviour is
                unchanged for existing call sites).

        Returns:
            str: Secure (https) URL of the uploaded image.

        Raises:
            Exception: If the upload fails for any reason.
        """
        try:
            # SECURITY: credentials come from the environment (load_dotenv()
            # is called at module import).  The literal fallbacks only keep
            # old deployments working and should be removed -- and the
            # exposed credentials rotated -- once .env is configured.
            cloudinary.config(
                cloud_name=os.getenv("CLOUDINARY_CLOUD_NAME", "dw9bbrnke"),
                api_key=os.getenv("CLOUDINARY_API_KEY", "639987295942778"),
                api_secret=os.getenv("CLOUDINARY_API_SECRET", api_key),
            )

            # Unique, sortable public id per upload.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"translation_{timestamp}"

            result = cloudinary.uploader.upload(
                image_bytes,
                public_id=filename,
                resource_type="image",
                overwrite=True,
            )

            return result.get("secure_url")

        except Exception as e:
            print(f"Error uploading to Cloudinary: {e}")
            raise Exception(f"Failed to upload image: {str(e)}")

    def extract_sentences_with_positions(self, image_url: str) -> List[Dict[str, Any]]:
        """
        Ask a vision model to OCR *image_url* and return positioned regions.

        Args:
            image_url: Public URL of the image to analyse.

        Returns:
            List of dicts with ``text``, ``x``, ``y``, ``width``, ``height``
            keys (pixel coordinates); an empty list when extraction fails
            (best-effort contract preserved from the original).
        """
        try:
            payload = {
                "model": "google/gemini-2.5-flash-preview-05-20",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": """
                                Hãy phân tích ảnh này và trích xuất tất cả đoạn text.
                                Với mỗi đoạn, trả về:
                                - text gốc
                                - x, y, width, height (tọa độ pixel tuyệt đối trên ảnh gốc, với (0,0) là góc trên bên trái, đơn vị là pixel)
                                Format kết quả là JSON array như sau:
                                [
                                    {
                                        "type": "translation-image",
                                        "text": "nội dung text",
                                        "x": 123,
                                        "y": 45,
                                        "width": 67,
                                        "height": 20
                                    }
                                ]
                                """
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": image_url
                                }
                            }
                        ]
                    }
                ]
            }

            # Timeout added: the OCR call previously could hang the request
            # worker indefinitely.
            response = requests.post(
                url=f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=60,
            )
            response.raise_for_status()

            result = response.json()
            content = result['choices'][0]['message']['content']

            # The model wraps the JSON array in prose/markdown; slice out
            # the outermost [...] and validate the indices -- previously a
            # missing bracket (find() == -1) produced a garbage slice.
            start = content.find('[')
            end = content.rfind(']')
            if start == -1 or end == -1 or end < start:
                raise ValueError("No JSON array found in model response")
            return json.loads(content[start:end + 1])

        except Exception as e:
            # Best-effort: log and return an empty region list so callers
            # degrade gracefully instead of crashing the endpoint.
            print(f"Error extracting text: {e}")
            return []
"""Manual smoke test for the OpenRouter vision (image OCR) endpoint.

Run directly: ``python test.py``.  Requires OPENROUTER_API_KEY in the
environment -- the key was previously hardcoded here, which leaked a live
credential into version control; that key must be rotated.
"""

import json
import os

import requests


def main() -> None:
    """Send one image-OCR request to OpenRouter and print the response."""
    # Fail fast with a clear KeyError if the key is not configured.
    api_key = os.environ["OPENROUTER_API_KEY"]

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            # Optional: site URL / title for rankings on openrouter.ai.
            "HTTP-Referer": "https://www.google.com",
            "X-Title": "Google",
        },
        json={
            "model": "google/gemini-2.0-flash-001",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text content with their exact coordinates from this image"
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "https://res.cloudinary.com/dw9bbrnke/image/upload/v1749026020/translation_20250604_153336.jpg"
                            }
                        }
                    ]
                }
            ],
        },
        timeout=60,  # don't hang forever on a stalled API
    )
    response.raise_for_status()  # surface HTTP errors instead of printing them
    print(json.dumps(response.json(), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
interaction via WebSocket) - -Guidelines: - -1. Use FastAPI async endpoints and WebSocket routes for low latency. -2. Organize code into modular services: audio_processing, translation, websocket_handling, api_routes. -3. Use Pydantic models for request/response validation with type hints. -4. Integrate external AI/ML models and APIs with clear interface abstractions and fallback strategies. -5. Ensure thread-safe, async-friendly design especially for audio stream processing. -6. Follow OpenAPI standards; auto-generate docs from FastAPI annotations. -7. Use Python logging with appropriate levels (INFO, WARNING, ERROR) and structured logs for troubleshooting. -8. Implement security measures: input sanitization, WebSocket authentication, rate limiting. -9. Write unit and integration tests using pytest and httpx (or websockets testing tools). -10. Use environment variables or config files to manage API keys and sensitive data. -11. Optimize audio data handling to minimize latency and memory usage. -12. Frontend communication via WebSocket should support reconnect and error handling. -13. Document complex workflows, especially audio streaming and translation pipelines. -14. Code style: PEP 8 compliance, 4-space indentation, meaningful English names. -15. For every API or WebSocket handler, include error handling and clear response schemas. -16. Avoid blocking calls in async functions; use asyncio or thread pools as needed. -17. When suggesting code, include necessary import statements and type annotations. -18. All answers must be equal to ** Vietnamese **, unless otherwise required. -19. Do not arbitrarily edit or change my file in cursor if I ** has not allowed **. Should only give a specific code or suggestion with the explanation. -20. When there are many deployment options, please ** compare and explain advantages/disadvantages ** briefly and clearly. -21. Priority is a simple, effective, easy -to -expand and long -term solution. - - -