-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
109 lines (94 loc) · 5.6 KB
/
app.py
File metadata and controls
109 lines (94 loc) · 5.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
from pdf2image import convert_from_path
import easyocr
import difflib
import re
# Paths
pdf_path = "result.pdf"
poppler_path = r"C:\poppler\bin"  # Specify the path to the Poppler binaries
output_dir = './images'
final_result_file = "final.txt"
extracted_text_file = "extracted.txt"  # File for saving all extracted text

# Create the output directory for the per-page images. exist_ok=True makes this
# one call that succeeds whether or not the directory already exists, instead
# of a separate existence check followed by a create (which can race).
os.makedirs(output_dir, exist_ok=True)
# Convert PDF to images.
# The configured Poppler directory is only passed through when it actually
# exists on this machine; otherwise None lets pdf2image locate Poppler via
# PATH, so the script is not tied to a single Windows install location.
_poppler_dir = poppler_path if os.path.isdir(poppler_path) else None
images = convert_from_path(pdf_path, poppler_path=_poppler_dir)

# Initialize EasyOCR Reader
reader = easyocr.Reader(['en'])  # Specify the language(s)

# Dictionary to hold counts for each campus (for passed students):
# campus name -> number of symbol numbers attributed to it so far.
campus_counts = {}
current_campus = ""  # Tracks the current campus being processed; "" until one is detected
# Seat capacity per campus: campus name -> number of seats.
# The keys double as the list of known campus names used for matching OCR text.
seat_capacity = {
    "Patan Multiple Campus": 144,
    "Amrit Campus": 144,
    "Bhaktapur Multiple Campus": 72,
    "Padmakanya Multiple Campus": 72,
    "St. Xavier's College": 48,
    "Kathford International College of Engineering and Management": 48,
    "New Summit College": 48,
    "Prime College": 48,
    "St. Lawrence College": 36,
    "College of Applied Business and Technology": 36,
    "Kathmandu BernHardt College": 48,
    "Deerwalk Institute of Technology": 48,
    "Vedas College": 48,
    "Texas International College": 48,
    "Ambition College": 36,
    "National College of Computer Studies": 48,
    "Orchid International College": 48,
    "Sagarmatha College of Science & Technology": 48,
    "Nagarjuna College of Information Technology": 36,
    "Academia International College": 48,
    "Himalaya College of Engineering": 48,
    "Asian School of Management and Technology": 48,
    "Madan Bhandari Memorial College": 48,
    "Nepalaya College": 36,
    "Asian College of Higher Studies": 48,
    "Trinity International College": 48,
    "Samriddhi College": 48,
    "Swastik College": 36,
    "Kathmandu College of Technology": 36,
    "NIST": 36,
    "Siddhanath Science Campus": 72,
    "Ramswaru Ramsagar Multiple Campus": 72,
    "Mechi Multiple Campus": 36,
    "Shreeyantra College": 48,
    "Central Campus of Technology": 36,
    "Birendra Memorial College": 48,
    "Godawari College": 36,
    "Mahendra Morang Adarsh Multiple Campus": 72,
    "Birat Kshitiz College": 36,
    "Nihareeka College": 36,
    "Birat Multiple College": 36,
    "AIMS College": 36,
    "Himalaya Darshan College": 36,
    "National Infotech": 48,
    "Hetauda City College": 36,
    "Birendra Multiple Campus": 72,
    "Chitwan College of Technology": 36,
    "Lumbini ICT Campus": 48,
    "Prithvi Narayan Campus": 72,
    "Mount Annapurna Campus": 36,
    "Soch College of IT": 36,
    "Butwal Multiple Campus": 72,
    "Lumbini City College": 36,
    "Nepathya College": 36,
    "Bhairahawa Multiple Campus": 72,
    "Mahendra Multiple Campus, Nepalgunj": 72,
    "Banke Bageshwori Campus, Nepalgunj": 36,
    "Nepalgunj Campus": 36,
    "Mahendra Multiple Campus, Dang": 36,
    "Ambikeshwari Campus": 36,
}
# Regular expression to find symbol numbers: a standalone run of exactly 8 digits,
# i.e. the prefix "79" followed by six more digits, delimited by word boundaries.
# NOTE(review): confirm that symbol numbers really are 8 digits long (not 7).
symbol_pattern = re.compile(r"\b79\d{6}\b")
# Run OCR over every page image, log the raw text for review, and tally symbol
# numbers against the most recently detected campus.
#
# The review file is opened as UTF-8 explicitly: OCR output may contain
# characters the platform's default encoding cannot represent, which would
# otherwise abort the run with a UnicodeEncodeError part-way through.
known_campuses = list(seat_capacity)  # built once; reused for every fuzzy lookup
with open(extracted_text_file, 'w', encoding='utf-8') as extracted_file:
    # Process each image
    for page_number, image in enumerate(images, start=1):
        # Save the image for potential debugging/visual review
        image_path = os.path.join(output_dir, f'page-{page_number}.png')
        image.save(image_path, 'PNG')
        print(f'Saved image: {image_path}')

        # Perform OCR on the saved image
        results = reader.readtext(image_path)

        # Process each extracted text fragment
        for _bbox, text, _prob in results:
            print(f"Extracted Text: {text}")  # Debugging output to see OCR results
            extracted_file.write(f"{text}\n")  # Keep the raw text for further review

            # Attempt to find a matching campus from the list of known campuses.
            # A match switches the "current" campus; later symbol numbers are
            # credited to it until another campus name is detected.
            closest_match = difflib.get_close_matches(text, known_campuses, n=1, cutoff=0.6)
            if closest_match:
                current_campus = closest_match[0]
                print(f"Detected campus (best match): {current_campus}")
                # Start the campus at zero the first time it is seen; keep any existing tally.
                campus_counts.setdefault(current_campus, 0)
            else:
                # Continue counting symbols for the previously detected campus
                print(f"No campus match found for: {text}")

            # Search for symbol numbers in the extracted text. They are only
            # counted once some campus has been detected (current_campus is
            # still "" before the first match).
            symbol_numbers = symbol_pattern.findall(text)
            if symbol_numbers and current_campus:
                count = len(symbol_numbers)
                campus_counts[current_campus] += count  # Increment count for the current campus
                print(f"Found {count} symbol numbers for campus: {current_campus}")
                print(f"Updated count for {current_campus}: {campus_counts[current_campus]}")
# Write the per-campus pass counts and pass percentages, then report what was produced.
if campus_counts:
    # UTF-8 is set explicitly so the output does not depend on the platform's
    # default text encoding.
    with open(final_result_file, 'w', encoding='utf-8') as f:
        for campus, passed_students in campus_counts.items():
            total_seats = seat_capacity.get(campus, 0)
            # Skip campuses without a positive known capacity (avoids division by zero).
            if total_seats <= 0:
                continue
            pass_percentage = (passed_students / total_seats) * 100
            result_line = f'{campus}: {passed_students} ({pass_percentage:.2f}%)'
            f.write(f'{result_line}\n')
            print(result_line)  # Display the result
    # Only claim the results file was saved when it was actually written.
    print(f'Counts and percentages saved to {final_result_file}')
else:
    print("No campuses found to write to the file.")

# Summary: the extracted-text file is reported whether or not any campus matched.
print(f'All extracted text saved to {extracted_text_file}')