Skip to content

Commit 0dbec1b

Browse files
authored
enable miner-u on vllm image (#86)
* enable miner-u
* update
1 parent 7dcc4cb commit 0dbec1b

File tree

2 files changed

+63
-0
lines changed

2 files changed

+63
-0
lines changed

vllm/docker/Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ RUN apt-get update -y && \
4040

4141
WORKDIR /llm
4242
COPY ./patches/vllm_for_multi_arc.patch /tmp/
43+
COPY ./patches/miner-u.patch /tmp/
4344

4445
# Set environment variables early
4546
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
@@ -57,6 +58,15 @@ RUN git clone -b v0.10.0 https://github.com/vllm-project/vllm.git && \
5758
export CPATH=/opt/intel/oneapi/dpcpp-ct/2025.1/include/:${CPATH} && \
5859
python3 setup.py install
5960

61+
# Clone + patch miner-U
62+
RUN git clone https://github.com/opendatalab/MinerU.git && \
63+
cd MinerU && \
64+
git checkout de41fa58590263e43b783fe224b6d07cae290a33 && \
65+
git apply /tmp/miner-u.patch && \
66+
pip install -e .[core] && \
67+
sed -i 's/select_device(self.args.device, verbose=verbose)/torch.device(self.args.device)/' /usr/local/lib/python3.12/dist-packages/ultralytics/engine/predictor.py
68+
69+
6070
# ======= Add oneCCL build =======
6171
# RUN apt-get update && apt-get install -y \
6272
# cmake \

vllm/patches/miner-u.patch

Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,53 @@
1+
diff --git a/mineru/backend/pipeline/pipeline_analyze.py b/mineru/backend/pipeline/pipeline_analyze.py
2+
index de933059..6c421595 100644
3+
--- a/mineru/backend/pipeline/pipeline_analyze.py
4+
+++ b/mineru/backend/pipeline/pipeline_analyze.py
5+
@@ -125,7 +125,7 @@ def doc_analyze(
6+
f'Batch {index + 1}/{len(batch_images)}: '
7+
f'{processed_images_count} pages/{len(images_with_extra_info)} pages'
8+
)
9+
- batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
10+
+ batch_results = batch_image_analyze(batch_image, formula_enable, table_enable, len(images_with_extra_info))
11+
results.extend(batch_results)
12+
13+
# 构建返回结果
14+
@@ -149,7 +149,9 @@ def doc_analyze(
15+
def batch_image_analyze(
16+
images_with_extra_info: List[Tuple[PIL.Image.Image, bool, str]],
17+
formula_enable=True,
18+
- table_enable=True):
19+
+ table_enable=True,
20+
+ paths=0,
21+
+ ):
22+
# os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)
23+
24+
from .batch_analyze import BatchAnalyze
25+
@@ -198,6 +200,15 @@ def batch_image_analyze(
26+
else:
27+
enable_ocr_det_batch = True
28+
29+
+ batch_ratio = 16
30+
+ min_path = int(os.getenv('MIN_ENABLE_OCR_DET_BATCH_PATH', 20))
31+
+ if paths >= min_path:
32+
+ enable_ocr_det_batch = True
33+
+ print(f"enable_ocr_det_batch: {enable_ocr_det_batch}")
34+
+
35+
+ batch_model = BatchAnalyze(model_manager, batch_ratio, formula_enable, table_enable, enable_ocr_det_batch)
36+
+
37+
+
38+
batch_model = BatchAnalyze(model_manager, batch_ratio, formula_enable, table_enable, enable_ocr_det_batch)
39+
results = batch_model(images_with_extra_info)
40+
41+
diff --git a/mineru/utils/config_reader.py b/mineru/utils/config_reader.py
42+
index f6d013ea..85a70ede 100644
43+
--- a/mineru/utils/config_reader.py
44+
+++ b/mineru/utils/config_reader.py
45+
@@ -79,6 +79,8 @@ def get_device():
46+
else:
47+
if torch.cuda.is_available():
48+
return "cuda"
49+
+ elif torch.xpu.is_available():
50+
+ return "xpu"
51+
elif torch.backends.mps.is_available():
52+
return "mps"
53+
else:

0 commit comments

Comments
 (0)