mypython/img_validator.py at main · tjsky/mypython · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import os
import platform
import sys
import warnings
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime

import jinja2
from PIL import Image, ImageFile
from tqdm import tqdm

# Raise Pillow's maximum image pixel limit (the PIL default is 89,478,485,
# which cannot cope with ultra-high-resolution images).
Image.MAX_IMAGE_PIXELS = 200_000_000

# HTML template for the report.
# Jinja2 source rendered by generate_report(). It reads the variables
# scan_time, scan_dir, corrupted_count, total_files and duration, and iterates
# corrupted_files, using the `path` and `error` fields of each entry.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>图片完整性检测</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 2em; }
        .summary { background: #f8f9fa; padding: 20px; border-radius: 5px; }
        table { width: 100%; border-collapse: collapse; margin-top: 20px; }
        th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
        tr:hover { background-color: #f5f5f5; }
        .critical { color: #dc3545; font-weight: bold; }
    </style>
</head>
<body>
    <h1>图片完整性检测</h1>
    <div class="summary">
        <p>扫描时间：{{ scan_time }}</p>
        <p>扫描目录：{{ scan_dir }}</p>
        <p class="critical">损坏文件：{{ corrupted_count }} / {{ total_files }}</p>
        <p>完成耗时：{{ duration }} 秒</p>
    </div>

    <h2>损坏文件详情</h2>
    <table>
        <thead>
            <tr>
                <th>文件路径</th>
                <th>错误信息</th>
            </tr>
        </thead>
        <tbody>
            {% for file in corrupted_files %}
            <tr>
                <td><a href="{{ file.path }}">🔗</a>{{ file.path }}</td>
                <td>{{ file.error }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
</body>
</html>
"""

# Hidden-file detection that works on both Unix-like systems and Windows.
def is_hidden_file(path):
    """Tell whether *path* refers to a hidden file or directory.

    A path counts as hidden when it exists and either its final component
    starts with a dot or, on Windows, its hidden file attribute is set.

    Args:
        path: File-system path to inspect.

    Returns:
        True if the path is hidden; False if it is not hidden, does not
        exist, or could not be inspected (the failure is printed).
    """
    hidden_flag = 0x2  # FILE_ATTRIBUTE_HIDDEN
    try:
        if os.path.exists(path):
            if os.path.basename(path).startswith('.'):
                return True
            if platform.system() == 'Windows':
                return bool(os.stat(path).st_file_attributes & hidden_flag)
        return False
    except Exception as exc:
        # Best effort: report the problem and treat the path as visible.
        print(f"隐藏检测异常 {path}: {exc!s}")
        return False

# Image integrity check.
def validate_image_file(file_path):
    """Check that an image file passes Pillow's verification and fully decodes.

    Args:
        file_path: Path of the image file to check.

    Returns:
        A ``(file_path, error)`` tuple. ``error`` is ``None`` when both checks
        pass; otherwise it is a non-empty string describing the failure.
        This function never raises for a bad file, so it is safe to run in a
        worker process.
    """
    try:
        with warnings.catch_warnings():
            # Turn TIFF plugin warnings into exceptions so damaged metadata is
            # reported instead of being silently tolerated.
            warnings.filterwarnings("error", category=UserWarning, module="PIL.TiffImagePlugin")

            # verify() leaves the image object unusable for further work, so
            # the structural check and the full decode each get a fresh handle.
            with Image.open(file_path) as img:
                img.verify()
            with Image.open(file_path) as img:
                img.load()
        return (file_path, None)

    except UserWarning as uw:
        # A warning escalated by the filter above becomes an error message.
        return (file_path, f"元数据损坏: {str(uw)}")

    except Exception as e:
        # Some exceptions carry no message; fall back to the type name so the
        # result can never be mistaken for success.
        return (file_path, str(e) or type(e).__name__)

# Generate the HTML validation report.
def generate_report(output_path, data):
    """Render HTML_TEMPLATE with *data* and write the result as UTF-8.

    Args:
        output_path: Destination path of the HTML file; it is overwritten.
        data: Mapping of template variables, passed to the template as
            keyword arguments.
    """
    # File paths and error messages originate from the scanned file system and
    # are placed into both text and attribute positions, so escape them rather
    # than letting a crafted file name inject markup into the report.
    env = jinja2.Environment(autoescape=True)
    template = env.from_string(HTML_TEMPLATE)
    html = template.render(**data)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html)

# Scan all images.
def scan_images(directory, max_workers=None):
    """Collect image files under *directory* and validate them in parallel.

    Hidden files are skipped and hidden directories are not descended into.
    Files are selected by extension (case-insensitive).

    Args:
        directory: Root directory to walk.
        max_workers: Worker process count handed to ProcessPoolExecutor;
            ``None`` leaves the choice to the executor.

    Returns:
        A ``(corrupted, total)`` tuple. ``corrupted`` is a list of
        ``{"path": ..., "error": ...}`` dicts, one per file whose validation
        failed or could not be completed; ``total`` is the number of image
        files that were submitted for validation.
    """
    # NOTE(review): '.heic' presumably needs an extra Pillow plugin to open;
    # without one such files would be reported as failures - confirm.
    supported_ext = {'.jpg', '.jpeg', '.png', '.gif',
                     '.bmp', '.webp', '.tif', '.tiff', '.heic'}

    image_files = []
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk() does not descend into hidden directories.
        dirs[:] = [d for d in dirs if not is_hidden_file(os.path.join(root, d))]

        for file in files:
            file_path = os.path.join(root, file)
            if is_hidden_file(file_path):
                continue

            if os.path.splitext(file)[1].lower() in supported_ext:
                image_files.append(file_path)

    # Validate in parallel.
    corrupted = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(validate_image_file, path): path
                   for path in image_files}

        # Progress bar.
        with tqdm(total=len(image_files), desc="扫描进度", unit="file") as pbar:
            for future in as_completed(futures):
                path = futures[future]
                try:
                    _, error = future.result()
                except Exception as exc:
                    # The worker itself failed (for example the process died).
                    # Record that against the file instead of aborting the
                    # scan and discarding every result gathered so far.
                    error = f"{type(exc).__name__}: {exc}"
                # Compare with None: an empty message is still a failure.
                if error is not None:
                    corrupted.append({"path": path, "error": error})
                pbar.update(1)

    return corrupted, len(image_files)

def main(argv=None):
    """Run a scan from the command line and write the HTML report.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``. ``argv[1]`` is the directory to scan and the
            optional ``argv[2]`` is the number of worker processes.

    Returns:
        Process exit status: 0 after a completed scan, 1 when the arguments
        are missing or invalid.
    """
    from pathlib import Path  # only needed here, to build a valid file URI

    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("使用方法: python img_validator.py <目录路径> [并发数(留空使用CPU全部线程)]")
        return 1

    target_dir = args[1]
    # os.walk() yields nothing for a missing directory, which would otherwise
    # produce a misleading "0 files scanned" report.
    if not os.path.isdir(target_dir):
        print(f"错误: 目录不存在: {target_dir}")
        return 1

    # None lets ProcessPoolExecutor pick its own default worker count, which
    # respects platform limits that a raw os.cpu_count() may exceed.
    max_workers = None
    if len(args) > 2:
        try:
            max_workers = int(args[2])
        except ValueError:
            max_workers = 0
        if max_workers < 1:
            print(f"错误: 并发数必须是正整数: {args[2]}")
            return 1

    start_time = datetime.now()
    corrupted_files, total_files = scan_images(target_dir, max_workers)
    duration = round((datetime.now() - start_time).total_seconds(), 2)

    # Build the HTML report.
    report_data = {
        "scan_time": start_time.strftime("%Y-%m-%d %H:%M:%S"),
        "scan_dir": os.path.abspath(target_dir),
        "corrupted_count": len(corrupted_files),
        "total_files": total_files,
        "duration": duration,
        "corrupted_files": corrupted_files
    }

    report_path = os.path.join(os.getcwd(), "image_validation_report.html")
    generate_report(report_path, report_data)

    # Console summary.
    print(f"\n扫描完成！耗时 {duration} 秒")
    print(f"检测文件总数: {total_files}")
    print(f"损坏文件数量: {len(corrupted_files)}")
    # as_uri() percent-encodes the path and handles Windows drive letters.
    print(f"报告已生成: {Path(report_path).as_uri()}")
    return 0


# The guard is required: worker processes may re-import this module, and the
# scan must not start again inside them.
if __name__ == "__main__":
    sys.exit(main())