Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ coverage.xml

llm_web_kit.egg-info/*
.llm-web-kit.jsonc
.llm-web-kit-pageclassify.jsonc
4 changes: 4 additions & 0 deletions llm_web_kit/exception/exception.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@
"CleanModelException": {
"code": 46000000,
"message": "Clean model exception"
},
"CleanModelUnsupportedLanguageException": {
"code": 46100000,
"message": "Clean model unsupported language exception"
}
}
}
13 changes: 13 additions & 0 deletions llm_web_kit/exception/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,16 @@ def __init__(self, custom_message: str | None = None, error_code: int | None = N
if error_code is None:
error_code = ErrorMsg.get_error_code('Model', 'CleanModelException')
super().__init__(custom_message, error_code)


##############################################################################
#
# Clean Model Exceptions
#
##############################################################################
class CleanModelUnsupportedLanguageException(CleanModelException):
    """Raised when the clean model is asked to process a language it does not support."""
    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        # Fall back to the registered error code for this exception name
        # (46100000 per exception.jsonc) when the caller does not supply one.
        if error_code is None:
            error_code = ErrorMsg.get_error_code('Model', 'CleanModelUnsupportedLanguageException')
        super().__init__(custom_message, error_code)
3 changes: 1 addition & 2 deletions llm_web_kit/extractor/html/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:

return data_json

def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -> (str, str):
def _extract_main_html(self, raw_html:str, base_url:str, page_layout_type:str) -> Tuple[str, str]:
"""从html文本中提取主要的内容.

Args:
Expand All @@ -126,7 +126,6 @@ def _extract_code(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st
base_url (str): html文本的网页地址
html_lst (List[Tuple[str,str]]): html文本
raw_html (str): html文本

Returns:
"""

Expand Down
142 changes: 142 additions & 0 deletions llm_web_kit/html_layout_classify/classify-spot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/bin/bash

# Turn off any shell proxy helpers if they are installed (best-effort; silently skipped when absent).
command -v proxyoff >/dev/null 2>&1 && proxyoff
command -v proxy_off >/dev/null 2>&1 && proxy_off


#######################################
# Count GPUs currently allocated to this user's jobs in one partition.
# Sums the numeric part of every "gpu:N" token in `squeue --me` output.
# Arguments: $1 - partition name
# Outputs:   total GPU count on stdout
#######################################
function count_used_gpus(){
    local all_jobs
    all_jobs=$(squeue --me -p "$1")

    local gpu_num=0
    local token num
    # Intentionally unquoted: rely on word-splitting of squeue output.
    for token in $all_jobs
    do
        if [[ "$token" == *"gpu:"* ]]; then
            num="${token//gpu:/}"   # strip the "gpu:" prefix, keep the count
            gpu_num=$((gpu_num + num))
        fi
    done
    echo "$gpu_num"
}


# Count this user's jobs in PD (pending) state whose squeue line mentions "spot".
get_pd_count() {
    # -h suppresses the header; grep -c replaces the grep | wc -l pipeline.
    # NB: grep -c exits non-zero when the count is 0 — callers only read stdout.
    squeue -u "$USER" -t PD -h | grep -c spot
}

# Sum the SPOT_USED column across all partitions reported by `svp list`.
calculate_total_spot_used() {
    # Declare and assign separately so a failing `svp list` is not masked
    # by the exit status of `local`.
    local svp_output
    svp_output=$(svp list)

    local total_spot_used
    total_spot_used=$(echo "$svp_output" | awk '
        NR == 1 { next }      # skip the header row
        { sum += $6 }         # SPOT_USED assumed to be column 6 — TODO confirm against svp output
        END { print sum }
    ')

    echo "$total_spot_used"
}

# Sum the reserved-idle column across all partitions reported by `svp list`.
calculate_total_reserved_idle() {
    # Declare and assign separately so a failing pipeline is not masked by `local`.
    local total_reserved_idle
    total_reserved_idle=$(svp list | awk '
        NR == 1 { next }   # skip the header row
        { sum += $5 }      # reserved-idle assumed to be column 5 — TODO confirm against svp output
        END { print sum }
    ')
    echo "$total_reserved_idle"
}

#######################################################################################
# Parse command line arguments.
while [[ $# -gt 0 ]]; do
    case $1 in
        --partation)
            PARTATION="$2"
            shift 2
            ;;
        --tag)
            TAG="$2"
            shift 2
            ;;
        --task-num)
            TASK_NUM="$2"
            shift 2
            ;;
        --debug)
            DEBUG=1
            shift 1
            ;;
        --server-addr)
            SERVER_ADDR="$2"
            shift 2
            ;;
        --result-save-dir)
            RESULT_SAVE_DIR="$2"
            shift 2
            ;;
        *)
            # Diagnostics go to stderr, not stdout.
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done

MAX_PENDING_JOBS=10   # cap on this user's pending (PD) jobs
MAX_JOBS=1000         # cap on this user's total submitted spot jobs
MY_NAME="${USER}"     # user name

MY_HOME="${HOME}"
# NOTE(review): "slum-logs" looks like a typo for "slurm-logs" — kept for compatibility with classify.sh.
SLURM_LOG_DIR=${MY_HOME}/slum-logs/${TAG}
# Create the log directories if they do not exist yet.
mkdir -p "${SLURM_LOG_DIR}/logs"
mkdir -p "${SLURM_LOG_DIR}/error"
export SLURM_SUBMIT_DIR=${SLURM_LOG_DIR}
# NOTE(review): hard-coded user path; classify.sh derives this from ${USER} instead — confirm which is intended.
export LLM_WEB_KIT_CFG_PATH=/share/xuchao/.llm-web-kit-pageclassify.jsonc
TASK_NUM="${TASK_NUM:-1}"   # default to 1 if not provided
DEBUG="${DEBUG:-0}"

# Check required arguments.
if [ -z "$PARTATION" ] || [ -z "$TAG" ]; then
    echo "Usage: $0 --partation <partition_name> --tag <tag_name>" >&2
    exit 1
fi

# Core idea: keep submitting jobs forever, as long as we stay under the pending-job
# and total-job caps.
while true
do
    # PARTATION is a scalar, so this loop body runs exactly once per outer
    # iteration (the unconditional break below enforces that either way).
    for partation in "${PARTATION[@]}"; do
        PD_COUNT=$(get_pd_count)
        spot_count=$(squeue -u "${MY_NAME}" | grep -i spot | wc -l)

        if [ "$PD_COUNT" -lt "$MAX_PENDING_JOBS" ] && [ "$spot_count" -lt "$MAX_JOBS" ]; then
            # Under both caps: submit one new task.
            if [ "$DEBUG" -eq 1 ]; then
                LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n${TASK_NUM} --gres=gpu:1 python main.py ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
            else
                # NOTE(review): classify.sh passes "--server-addr ${SERVER_ADDR}" here,
                # but this script omits the flag — confirm against main.py's CLI.
                LOG_LEVEL=ERROR srun -p ${partation} --quotatype=spot --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err -N 1 -n ${TASK_NUM} --gres=gpu:1 --async python main.py ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
            fi
            echo "use ${partation} submit job succ, submit next job now..."
            # Best-effort cleanup of srun's generated batch scripts; an unmatched
            # glob makes rm fail, which is deliberately silenced.
            rm batchscript* 2>/dev/null
        fi
        break
    done # for
    sleep 20
done # while
108 changes: 108 additions & 0 deletions llm_web_kit/html_layout_classify/classify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#! /bin/bash

# Turn off any shell proxy helpers if they are installed (best-effort; silently skipped when absent).
command -v proxyoff >/dev/null 2>&1 && proxyoff
command -v proxy_off >/dev/null 2>&1 && proxy_off

#######################################
# Count GPUs currently allocated to this user's jobs in one partition.
# Sums the numeric part of every "gpu:N" token in `squeue --me` output.
# Arguments: $1 - partition name
# Outputs:   total GPU count on stdout
#######################################
function count_used_gpus(){
    local all_jobs
    all_jobs=$(squeue --me -p "$1")

    local gpu_num=0
    local word n
    # Intentionally unquoted: rely on word-splitting of squeue output.
    for word in $all_jobs
    do
        if [[ "$word" == *"gpu:"* ]]; then
            n="${word//gpu:/}"   # strip the "gpu:" prefix, keep the count
            gpu_num=$((gpu_num + n))
        fi
    done
    echo "$gpu_num"
}

# Parse command line arguments.
while [[ $# -gt 0 ]]; do
    case $1 in
        --partation)
            PARTATION="$2"
            shift 2
            ;;
        --max-job)
            MAX_JOB_TOTAL="$2"
            shift 2
            ;;
        --tag)
            TAG="$2"
            shift 2
            ;;
        --task-num)
            TASK_NUM="$2"
            shift 2
            ;;
        --debug)
            DEBUG=1
            shift 1
            ;;
        --result-save-dir)
            RESULT_SAVE_DIR="$2"
            shift 2
            ;;
        --server-addr)
            SERVER_ADDR="$2"
            shift 2
            ;;
        *)
            # Diagnostics go to stderr, not stdout.
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done


MY_HOME="${HOME}"
MY_NAME="${USER}"   # user name
# NOTE(review): "slum-logs" looks like a typo for "slurm-logs" — kept for compatibility with classify-spot.sh.
SLURM_LOG_DIR=${MY_HOME}/slum-logs/${TAG}
# Create the log directories if they do not exist yet.
mkdir -p "${SLURM_LOG_DIR}/logs"
mkdir -p "${SLURM_LOG_DIR}/error"
export SLURM_SUBMIT_DIR=${SLURM_LOG_DIR}
export LLM_WEB_KIT_CFG_PATH=/share/${MY_NAME}/.llm-web-kit-pageclassify.jsonc
TASK_NUM="${TASK_NUM:-1}"   # default to 1 if not provided
DEBUG="${DEBUG:-0}"
SERVER_ADDR="${SERVER_ADDR:-http://127.0.0.1:5000}"
PYTHON=/share/${MY_NAME}/.conda/envs/webkitdev/bin/python


# Check required arguments.
if [ -z "$PARTATION" ] || [ -z "$MAX_JOB_TOTAL" ] || [ -z "$TAG" ]; then
    # Usage string fixed: --debug is a flag and takes no value.
    echo "Usage: $0 --partation <partition_name> --max-job <max_job_count> --tag <tag_name> [--debug]" >&2
    exit 1
fi


submited_job_num=0 # number of jobs submitted so far

while [ "$submited_job_num" -lt "$MAX_JOB_TOTAL" ]
do
    # GPUs this user already uses in the partition (scalar; the original
    # wrapped this in a one-element array for no benefit).
    used_gpu=$(count_used_gpus "$PARTATION")
    # Available GPUs in the partition — column 5 of svp output, assumed; TODO confirm.
    avai_gpu=$(svp list -p "$PARTATION" | grep "$PARTATION" | awk '{print $5}')
    echo -e "check partation $PARTATION \n used_gpu: $used_gpu\n avai_gpu: $avai_gpu"

    # Default to 0 when svp produced no number so the -gt test cannot error out.
    if [ "${avai_gpu:-0}" -gt 0 ]; then
        # Submit one task, then sleep briefly before the next round.
        if [ "$DEBUG" -eq 1 ]; then
            LOG_LEVEL=INFO srun -p ${PARTATION} --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err --gres=gpu:1 -N 1 -n ${TASK_NUM} ${PYTHON} main.py --server-addr ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
        else

            LOG_LEVEL=ERROR srun -p ${PARTATION} --output=${SLURM_LOG_DIR}/logs/output_%j.out --export=ALL --error=${SLURM_LOG_DIR}/error/error_%j.err --gres=gpu:1 --async -N 1 -n ${TASK_NUM} ${PYTHON} main.py --server-addr ${SERVER_ADDR} --result-save-dir ${RESULT_SAVE_DIR}
        fi
        # TODO: verify that the srun submission actually succeeded before counting it.
        submited_job_num=$((submited_job_num+1))
        sleep 2
        echo "use ${PARTATION} submit job succ, submit next job now..."
        # Best-effort cleanup of srun's generated batch scripts; failures silenced.
        rm batchscript* 2>/dev/null
    else
        echo "skip ${PARTATION}, used_GPU = ${used_gpu}, no available GPU"
        sleep 2
    fi

done # while

echo "任务提交完成"
29 changes: 0 additions & 29 deletions llm_web_kit/html_layout_classify/html_layout_classify.md

This file was deleted.

Loading