From f108fd30e171c5f1bae10eba6cc2bcb9a1121686 Mon Sep 17 00:00:00 2001 From: Lu Date: Fri, 28 Feb 2025 15:25:58 +0800 Subject: [PATCH] fix duplicated translations of merged cells in word tables; fix ArrayFormula cell error in excel --- python/translate/common.py | 3 + python/translate/word.py | 165 +++++++++++++++++++++++++++---------- 2 files changed, 126 insertions(+), 42 deletions(-) diff --git a/python/translate/common.py b/python/translate/common.py index 34bb0e9..00224ed 100644 --- a/python/translate/common.py +++ b/python/translate/common.py @@ -5,6 +5,7 @@ import platform import subprocess from pathlib import Path +from openpyxl.worksheet.formula import ArrayFormula def is_all_punc(strings): if isinstance(strings, datetime.time): @@ -13,6 +14,8 @@ def is_all_punc(strings): return True elif isinstance(strings, (int, float, complex)): return True + elif isinstance(strings, (ArrayFormula)): + return True # print(type(strings)) chinese_punctuations=get_chinese_punctuation() for s in strings: diff --git a/python/translate/word.py b/python/translate/word.py index 486025e..de87fe9 100644 --- a/python/translate/word.py +++ b/python/translate/word.py @@ -124,6 +124,29 @@ def start(trans): translate.complete(trans,text_count,spend_time) return True +# 函数还有问题,不能很好地识别纵向单元格。待修改。 +def is_vertical_merge_continued(cell): + """ + 判断当前单元格是否为纵向合并中的续合单元格。 + + 原理: + - 对于纵向合并,首个单元格在其 元素中会标记 w:val="restart"; + 后续被合并的单元格则可能没有 w:val 属性、或其值为空,或标记为 "continue"。 + - 因此,如果找到 元素后,其 w:val 属性为 None、空字符串或等于 "continue" + (不区分大小写),则认为该单元格是续合单元格,返回 True;否则返回 False。 + """ + tc = cell._tc + tcPr = tc.tcPr + if tcPr is not None: + vMerge = tcPr.find(qn('w:vMerge')) + if vMerge is not None: + # 获取 w:val 属性值,注意有可能返回 None 或空字符串 + val = vMerge.get(qn('w:val')) + # 如果属性不存在、为空字符串或值为 "continue",则认为是续合单元格 + if val is None or val.strip() == '' or val.lower() == 'continue': + return True + return False + def read_paragraph_text(document, texts): for paragraph in document.paragraphs: @@ -135,33 +158,74 @@ 
def read_paragraph_text(document, texts): for footerparagraph in section.footer.paragraphs: read_run(footerparagraph.runs, texts) print("footerparagraph", footerparagraph.text) + # for table in document.tables: + # for row in table.rows: + # start_span=0 + # for cell in row.cells: + # read_cell_text(cell, texts) + # 处理表格中的单元格文本 for table in document.tables: for row in table.rows: - start_span=0 - for cell in row.cells: - read_cell_text(cell, texts) + current_col = 0 + cells = row.cells + # 横向合并的处理:使用 while 循环,根据 grid_span 跳过被合并的多余单元格 + while current_col < len(cells): + cell = cells[current_col] + # 获取 grid_span 值,默认为1 + tc = cell._tc + grid_span_elem = tc.tcPr.find(qn('w:gridSpan')) if tc.tcPr is not None else None + grid_span = int(grid_span_elem.get(qn('w:val'), '1')) if grid_span_elem is not None else 1 + # 新增:判断是否为纵向合并的续合单元格,如果是则跳过处理 + if not is_vertical_merge_continued(cell): + read_cell_text(cell, texts) + current_col += grid_span + def write_paragraph_text(document, texts, text_count, onlyText): for paragraph in document.paragraphs: replace_paragraph_text(paragraph, texts, text_count, onlyText, False) + # 处理表格中的单元格文本 for table in document.tables: for row in table.rows: - for cell in row.cells: - write_paragraph_text(cell, texts, text_count, onlyText) + current_col = 0 + cells = row.cells + while current_col < len(cells): + cell = cells[current_col] + tc = cell._tc + grid_span_elem = tc.tcPr.find(qn('w:gridSpan')) if tc.tcPr is not None else None + grid_span = int(grid_span_elem.get(qn('w:val'), '1')) if grid_span_elem is not None else 1 + # 只对非纵向续合单元格进行写入翻译文本 + if not is_vertical_merge_continued(cell): + write_paragraph_text(cell, texts, text_count, onlyText) + current_col += grid_span def write_both_new(document, texts, text_count, onlyText): for paragraph in document.paragraphs: replace_paragraph_text(paragraph, texts, text_count, onlyText, True) - section = document.sections[0] - for headerparagraph in section.header.paragraphs: - 
replace_paragraph_text(headerparagraph, texts, text_count,onlyText, True) - for footerparagraph in section.footer.paragraphs: - replace_paragraph_text(footerparagraph, texts, text_count,onlyText, True) + + # 下面的递归调用中,cell传参作为document,会在.sections的地方报错、翻译失败。 + try: + section = document.sections[0] + for headerparagraph in section.header.paragraphs: + replace_paragraph_text(headerparagraph, texts, text_count,onlyText, True) + for footerparagraph in section.footer.paragraphs: + replace_paragraph_text(footerparagraph, texts, text_count,onlyText, True) + except: + pass + # 处理表格中单元格的翻译文本写入 for table in document.tables: for row in table.rows: - for cell in row.cells: + current_col = 0 + cells = row.cells + while current_col < len(cells): + cell = cells[current_col] + tc = cell._tc + grid_span_elem = tc.tcPr.find(qn('w:gridSpan')) if tc.tcPr is not None else None + grid_span = int(grid_span_elem.get(qn('w:val'), '1')) if grid_span_elem is not None else 1 + # if not is_vertical_merge_continued(cell): write_both_new(cell, texts, text_count, onlyText) + current_col += grid_span def read_cell_text(cell, texts): for index,paragraph in enumerate(cell.paragraphs): @@ -194,11 +258,11 @@ def read_rune_text(document, texts): for footerparagraph in section.footer.paragraphs: read_run(footerparagraph.runs, texts) # print(datetime.datetime.now()) - for table in document.tables: - for row in table.rows: - start_span=0 - for cell in row.cells: - read_cell_text(cell, texts) + # for table in document.tables: + # for row in table.rows: + # start_span=0 + # for cell in row.cells: + # read_cell_text(cell, texts) # start_span+=1 # # if start_span==cell.grid_span: # # start_span=0 @@ -209,7 +273,21 @@ def read_rune_text(document, texts): # if len(paragraph.hyperlinks)>0: # for hyperlink in paragraph.hyperlinks: - # read_run(hyperlink.runs, texts) + # read_run(hyperlink.runs, texts) + # 处理表格中的单元格文本 + for table in document.tables: + for row in table.rows: + current_col = 0 + cells = row.cells + 
while current_col < len(cells): + cell = cells[current_col] + tc = cell._tc + grid_span_elem = tc.tcPr.find(qn('w:gridSpan')) if tc.tcPr is not None else None + grid_span = int(grid_span_elem.get(qn('w:val'), '1')) if grid_span_elem is not None else 1 + # 新增:只处理非纵向续合单元格,避免重复翻译 + if not is_vertical_merge_continued(cell): + read_cell_text(cell, texts) + current_col += grid_span def write_only_new(document, texts, text_count, onlyText): @@ -228,21 +306,20 @@ def write_only_new(document, texts, text_count, onlyText): write_run(headerparagraph.runs, texts) for footerparagraph in section.footer.paragraphs: write_run(footerparagraph.runs, texts) + # 处理表格中的单元格文本 for table in document.tables: for row in table.rows: - start_span=0 - for cell in row.cells: - write_cell_text(cell, texts) - # start_span+=1 - # if start_span==cell.grid_span: - # start_span=0 - # text_count+=write_cell(cell, texts) - # for paragraph in cell.paragraphs: - # text_count+=write_run(paragraph.runs, texts) - - # if len(paragraph.hyperlinks)>0: - # for hyperlink in paragraph.hyperlinks: - # text_count+=write_run(hyperlink.runs, texts) + current_col = 0 + cells = row.cells + while current_col < len(cells): + cell = cells[current_col] + tc = cell._tc + grid_span_elem = tc.tcPr.find(qn('w:gridSpan')) if tc.tcPr is not None else None + grid_span = int(grid_span_elem.get(qn('w:val'), '1')) if grid_span_elem is not None else 1 + # 仅处理非纵向续合单元格 + if not is_vertical_merge_continued(cell): + write_cell_text(cell, texts) + current_col += grid_span #保留原译文 def write_rune_both(document, texts, text_count, onlyText,target_lang): @@ -267,20 +344,24 @@ def write_rune_both(document, texts, text_count, onlyText,target_lang): footerparagraph.runs[-1].add_break() add_paragraph_run(footerparagraph, footerparagraph.runs, texts, text_count,target_lang) # text_count+=write_run(paragraph.runs, texts) + # 处理表格中单元格的文本写入 for table in document.tables: for row in table.rows: - # start_span=0 - for cell in row.cells: - # start_span+=1 
- # if start_span==cell.grid_span: - # start_span=0 - # text_count+=write_cell(cell, texts) - for paragraph in cell.paragraphs: - replace_paragraph_text(paragraph, texts, text_count, onlyText, True) - - if len(paragraph.hyperlinks)>0: - for hyperlink in paragraph.hyperlinks: - replace_paragraph_text(hyperlink, texts, text_count, onlyText, True) + current_col = 0 + cells = row.cells + while current_col < len(cells): + cell = cells[current_col] + tc = cell._tc + grid_span_elem = tc.tcPr.find(qn('w:gridSpan')) if tc.tcPr is not None else None + grid_span = int(grid_span_elem.get(qn('w:val'), '1')) if grid_span_elem is not None else 1 + # 仅处理非纵向续合单元格,避免重复写入译文 + if not is_vertical_merge_continued(cell): + for paragraph in cell.paragraphs: + replace_paragraph_text(paragraph, texts, text_count, onlyText, True) + if len(paragraph.hyperlinks) > 0: + for hyperlink in paragraph.hyperlinks: + replace_paragraph_text(hyperlink, texts, text_count, onlyText, True) + current_col += grid_span def read_run(runs,texts): # text=""