-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpdf_scrapper
More file actions
43 lines (34 loc) · 1.43 KB
/
pdf_scrapper
File metadata and controls
43 lines (34 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pdfplumber
import re
import pandas as pd
# Function to extract data from PDF
def extract_nptel_data(pdf_path, csv_output):
    """Extract NPTEL certificate details from a PDF and save them to a CSV.

    The text of every page is joined with newlines and scanned with a
    regular expression that expects, on consecutive lines:

    1. a name made of uppercase letters and spaces,
    2. a course name made of letters and spaces,
    3. an integer candidate score,
    4. ``<assignments>/25 <exam>/75``,
    5. a line of digits followed by two further non-empty lines (skipped),
    6. ``Roll No: <token>``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        csv_output: Destination passed to ``DataFrame.to_csv``.

    Returns:
        The ``pandas.DataFrame`` that was written, one row per match. All
        captured values are kept as strings exactly as they appear in the
        PDF text; the frame is empty when nothing matches.
    """
    # extract_text() may yield None for a page without extractable text
    # (e.g. a scanned image page); coalesce to "" so str.join cannot raise
    # TypeError on such pages.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    full_text = "\n".join(page_texts)

    # Regex pattern to extract required details
    pattern_with_roll = re.compile(
        r"([A-Z ]+)\n"  # Name
        r"([A-Za-z ]+)\n"  # Course Name
        r"(\d+)\n"  # Candidate Score
        r"([\d.]+)/25 ([\d.]+)/75\n"  # Online Assignments / Proctored Exam
        r"\d+\n.+?\n.+?\nRoll No: (\S+)"  # Roll No.
    )

    # findall returns one 6-tuple per certificate; only the two free-text
    # fields can carry stray leading/trailing spaces, so only they are stripped.
    rows = [
        [name.strip(), course.strip(), score, assignments, exam, roll_no]
        for name, course, score, assignments, exam, roll_no
        in pattern_with_roll.findall(full_text)
    ]

    # Convert to DataFrame and save to CSV
    df = pd.DataFrame(
        rows,
        columns=[
            "Name",
            "Course Name",
            "Candidate Score",
            "Online Assignments",
            "Proctored Exam",
            "Roll No",
        ],
    )
    df.to_csv(csv_output, index=False)
    print(f"Data extracted and saved to {csv_output}")
    return df
# Example usage
pdf_path = "nptel_cert.pdf"  # Change this to your file path
csv_output = "nptel_certificates_with_roll.csv"

# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse extract_nptel_data) does not read the sample
# PDF or write a CSV as a side effect.
if __name__ == "__main__":
    extract_nptel_data(pdf_path, csv_output)