Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ coverage.xml

llm_web_kit.egg-info/*
.llm-web-kit.jsonc
.llm-web-kit-pageclassify.jsonc
4 changes: 4 additions & 0 deletions llm_web_kit/exception/exception.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@
"CleanModelException": {
"code": 46000000,
"message": "Clean model exception"
},
"CleanModelUnsupportedLanguageException": {
"code": 46100000,
"message": "Clean model unsupported language exception"
}
}
}
13 changes: 13 additions & 0 deletions llm_web_kit/exception/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,16 @@ def __init__(self, custom_message: str | None = None, error_code: int | None = N
if error_code is None:
error_code = ErrorMsg.get_error_code('Model', 'CleanModelException')
super().__init__(custom_message, error_code)


##############################################################################
#
# Clean Model Exceptions
#
##############################################################################
class CleanModelUnsupportedLanguageException(CleanModelException):
    """Raised when the clean model is asked to process a language it does not support."""
    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        # Fall back to the registered error code for this exception name
        # (46100000 per exception.jsonc) when the caller does not supply one.
        if error_code is None:
            error_code = ErrorMsg.get_error_code('Model', 'CleanModelUnsupportedLanguageException')
        super().__init__(custom_message, error_code)
3 changes: 1 addition & 2 deletions llm_web_kit/extractor/html/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:

return data_json

def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -> (str, str):
def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -> Tuple[str, str]:
"""从html文本中提取主要的内容.

Args:
Expand All @@ -126,7 +126,6 @@ def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st
base_url (str): html文本的网页地址
html_lst (List[Tuple[str,str]]): html文本
raw_html (str): html文本

Returns:
"""

Expand Down
142 changes: 142 additions & 0 deletions llm_web_kit/html_layout_classify/classify-spot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/bin/bash

# Turn off any shell proxy helpers if they are installed (best-effort; silently skipped when absent).
command -v proxyoff >/dev/null 2>&1 && proxyoff
command -v proxy_off >/dev/null 2>&1 && proxy_off


#######################################
# Count GPUs currently allocated to this user's jobs in one partition.
# Sums the numeric part of every "gpu:N" token in `squeue --me` output.
# Arguments: $1 - partition name
# Outputs:   total GPU count on stdout
#######################################
function count_used_gpus(){
    local all_jobs
    all_jobs=$(squeue --me -p "$1")

    local gpu_num=0
    local token num
    # Intentionally unquoted: rely on word-splitting of squeue output.
    for token in $all_jobs
    do
        if [[ "$token" == *"gpu:"* ]]; then
            num="${token//gpu:/}"   # strip the "gpu:" prefix, keep the count
            gpu_num=$((gpu_num + num))
        fi
    done
    echo "$gpu_num"
}


# Count this user's jobs in PD (pending) state whose squeue line mentions "spot".
get_pd_count() {
    # -h suppresses the header; grep -c replaces the grep | wc -l pipeline.
    # NB: grep -c exits non-zero when the count is 0 — callers only read stdout.
    squeue -u "$USER" -t PD -h | grep -c spot
}

# Sum the SPOT_USED column across all partitions reported by `svp list`.
calculate_total_spot_used() {
    # Declare and assign separately so a failing `svp list` is not masked
    # by the exit status of `local`.
    local svp_output
    svp_output=$(svp list)

    local total_spot_used
    total_spot_used=$(echo "$svp_output" | awk '
        NR == 1 { next }      # skip the header row
        { sum += $6 }         # SPOT_USED assumed to be column 6 — TODO confirm against svp output
        END { print sum }
    ')

    echo "$total_spot_used"
}

# Sum the reserved-idle column across all partitions reported by `svp list`.
calculate_total_reserved_idle() {
    # Declare and assign separately so a failing pipeline is not masked by `local`.
    local total_reserved_idle
    total_reserved_idle=$(svp list | awk '
        NR == 1 { next }   # skip the header row
        { sum += $5 }      # reserved-idle assumed to be column 5 — TODO confirm against svp output
        END { print sum }
    ')
    echo "$total_reserved_idle"
}

#######################################################################################
# Parse command line arguments.
while [[ $# -gt 0 ]]; do
    case $1 in
        --partation)
            PARTATION="$2"
            shift 2
            ;;
        --tag)
            TAG="$2"
            shift 2
            ;;
        --task-num)
            TASK_NUM="$2"
            shift 2
            ;;
        --debug)
            DEBUG=1
            shift 1
            ;;
        --server-addr)
            SERVER_ADDR="$2"
            shift 2
            ;;
        --result-save-dir)
            RESULT_SAVE_DIR="$2"
            shift 2
            ;;
        *)
            # Diagnostics go to stderr, not stdout.
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done

MAX_PENDING_JOBS=10   # cap on this user's pending (PD) jobs
MAX_JOBS=1000         # cap on this user's total submitted spot jobs
MY_NAME="${USER}"     # user name

MY_HOME="${HOME}"
# NOTE(review): "slum-logs" looks like a typo for "slurm-logs" — kept for compatibility with classify.sh.
SLURM_LOG_DIR=${MY_HOME}/slum-logs/${TAG}
# Create the log directories if they do not exist yet.
mkdir -p "${SLURM_LOG_DIR}/logs"
mkdir -p "${SLURM_LOG_DIR}/error"
export SLURM_SUBMIT_DIR=${SLURM_LOG_DIR}
# NOTE(review): hard-coded user path; classify.sh derives this from ${USER} instead — confirm which is intended.
export LLM_WEB_KIT_CFG_PATH=/share/xuchao/.llm-web-kit-pageclassify.jsonc
TASK_NUM="${TASK_NUM:-1}"   # default to 1 if not provided
DEBUG="${DEBUG:-0}"

# Check required arguments.
if [ -z "$PARTATION" ] || [ -z "$TAG" ]; then
    echo "Usage: $0 --partation <partition_name> --tag <tag_name>" >&2
    exit 1
fi

# Core idea: keep submitting jobs forever, as long as we stay under the pending-job
# and total-job caps.
while true
do
    # PARTATION is a scalar, so this loop body runs exactly once per outer
    # iteration (the unconditional break below enforces that either way).
    for partation in "${PARTATION[@]}"; do
        PD_COUNT=$(get_pd_count)
        spot_count=$(squeue -u "${MY_NAME}" | grep -i spot | wc -l)

        if [ "$PD_COUNT" -lt "$MAX_PENDING_JOBS" ] && [ "$spot_count" -lt "$MAX_JOBS" ]; then
            # Under both caps: submit one new task.
            if [ "$DEBUG" -eq 1 ]; then
                LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n${TASK_NUM} --gres=gpu:1 python main.py ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
            else
                # NOTE(review): classify.sh passes "--server-addr ${SERVER_ADDR}" here,
                # but this script omits the flag — confirm against main.py's CLI.
                LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n ${TASK_NUM} --gres=gpu:1 --async python main.py ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
            fi
            echo "use ${partation} submit job succ, submit next job now..."
            # Best-effort cleanup of srun's generated batch scripts; an unmatched
            # glob makes rm fail, which is deliberately silenced.
            rm batchscript* 2>/dev/null
        fi
        break
    done # for
    sleep 20
done # while
108 changes: 108 additions & 0 deletions llm_web_kit/html_layout_classify/classify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#! /bin/bash

# Turn off any shell proxy helpers if they are installed (best-effort; silently skipped when absent).
command -v proxyoff >/dev/null 2>&1 && proxyoff
command -v proxy_off >/dev/null 2>&1 && proxy_off

#######################################
# Count GPUs currently allocated to this user's jobs in one partition.
# Sums the numeric part of every "gpu:N" token in `squeue --me` output.
# Arguments: $1 - partition name
# Outputs:   total GPU count on stdout
#######################################
function count_used_gpus(){
    local all_jobs
    all_jobs=$(squeue --me -p "$1")

    local gpu_num=0
    local word n
    # Intentionally unquoted: rely on word-splitting of squeue output.
    for word in $all_jobs
    do
        if [[ "$word" == *"gpu:"* ]]; then
            n="${word//gpu:/}"   # strip the "gpu:" prefix, keep the count
            gpu_num=$((gpu_num + n))
        fi
    done
    echo "$gpu_num"
}

# Parse command line arguments.
while [[ $# -gt 0 ]]; do
    case $1 in
        --partation)
            PARTATION="$2"
            shift 2
            ;;
        --max-job)
            MAX_JOB_TOTAL="$2"
            shift 2
            ;;
        --tag)
            TAG="$2"
            shift 2
            ;;
        --task-num)
            TASK_NUM="$2"
            shift 2
            ;;
        --debug)
            DEBUG=1
            shift 1
            ;;
        --result-save-dir)
            RESULT_SAVE_DIR="$2"
            shift 2
            ;;
        --server-addr)
            SERVER_ADDR="$2"
            shift 2
            ;;
        *)
            # Diagnostics go to stderr, not stdout.
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done


MY_HOME="${HOME}"
MY_NAME="${USER}"   # user name
# NOTE(review): "slum-logs" looks like a typo for "slurm-logs" — kept for compatibility with classify-spot.sh.
SLURM_LOG_DIR=${MY_HOME}/slum-logs/${TAG}
# Create the log directories if they do not exist yet.
mkdir -p "${SLURM_LOG_DIR}/logs"
mkdir -p "${SLURM_LOG_DIR}/error"
export SLURM_SUBMIT_DIR=${SLURM_LOG_DIR}
export LLM_WEB_KIT_CFG_PATH=/share/${MY_NAME}/.llm-web-kit-pageclassify.jsonc
TASK_NUM="${TASK_NUM:-1}"   # default to 1 if not provided
DEBUG="${DEBUG:-0}"
SERVER_ADDR="${SERVER_ADDR:-http://127.0.0.1:5000}"
PYTHON=/share/${MY_NAME}/.conda/envs/webkitdev/bin/python


# Check required arguments.
if [ -z "$PARTATION" ] || [ -z "$MAX_JOB_TOTAL" ] || [ -z "$TAG" ]; then
    # Usage string fixed: --debug is a flag and takes no value.
    echo "Usage: $0 --partation <partition_name> --max-job <max_job_count> --tag <tag_name> [--debug]" >&2
    exit 1
fi


submited_job_num=0 # number of jobs submitted so far

while [ "$submited_job_num" -lt "$MAX_JOB_TOTAL" ]
do
    # GPUs this user already uses in the partition (scalar; the original
    # wrapped this in a one-element array for no benefit).
    used_gpu=$(count_used_gpus "$PARTATION")
    # Available GPUs in the partition — column 5 of svp output, assumed; TODO confirm.
    avai_gpu=$(svp list -p "$PARTATION" | grep "$PARTATION" | awk '{print $5}')
    echo -e "check partation $PARTATION \n used_gpu: $used_gpu\n avai_gpu: $avai_gpu"

    # Default to 0 when svp produced no number so the -gt test cannot error out.
    if [ "${avai_gpu:-0}" -gt 0 ]; then
        # Submit one task, then sleep briefly before the next round.
        if [ "$DEBUG" -eq 1 ]; then
            LOG_LEVEL=INFO srun -p ${PARTATION} --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err --gres=gpu:1 -N 1 -n ${TASK_NUM} ${PYTHON} main.py --server-addr ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
        else

            LOG_LEVEL=ERROR srun -p ${PARTATION} --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err --gres=gpu:1 --async -N 1 -n ${TASK_NUM} ${PYTHON} main.py --server-addr ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
        fi
        # TODO: verify that the srun submission actually succeeded before counting it.
        submited_job_num=$((submited_job_num+1))
        sleep 2
        echo "use ${PARTATION} submit job succ, submit next job now..."
        # Best-effort cleanup of srun's generated batch scripts; failures silenced.
        rm batchscript* 2>/dev/null
    else
        echo "skip ${PARTATION}, used_GPU = ${used_gpu}, no available GPU"
        sleep 2
    fi

done # while

echo "任务提交完成"
29 changes: 0 additions & 29 deletions llm_web_kit/html_layout_classify/html_layout_classify.md

This file was deleted.

Loading