Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions Python/pdf_text_extractor/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
from PyPDF2 import PdfReader

# Folders scanned for PDFs and written with the extracted text. Both are
# relative paths, so they resolve against the current working directory.
INPUT_FOLDER = "input"
OUTPUT_FOLDER = "output"

# Create the output directory up front (this runs at import time) so later
# writes into it do not fail; exist_ok=True makes it a no-op if it exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def extract_text_from_pdf(pdf_path) -> str:
    """Extract all text from a single PDF file.

    Each page's text is preceded by a ``--- Page N ---`` marker line, where
    ``N`` is the 1-based page number. A page whose ``extract_text()`` returns
    nothing contributes only its marker.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages. If reading fails, the error is
        printed and whatever was collected before the failure is returned
        (an empty string when the file could not be opened at all).
    """
    # Collect the pieces and join once at the end; repeated ``str +=`` in a
    # loop can degrade to quadratic time on documents with many pages.
    chunks = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PdfReader(file)
            for page_num, page in enumerate(reader.pages, start=1):
                chunks.append(f"\n--- Page {page_num} ---\n")
                chunks.append(page.extract_text() or "")
    except Exception as e:
        # Deliberately broad: extraction is best-effort, so any failure is
        # reported and the text gathered so far is still returned.
        print(f"Error reading {pdf_path}: {e}")
    return "".join(chunks)

def process_all_pdfs():
    """Read all PDFs in the input folder and save their text to the output folder.

    Every file directly inside ``INPUT_FOLDER`` whose name ends in ``.pdf``
    (case-insensitive) is passed to ``extract_text_from_pdf``. The returned
    text is written as UTF-8 into ``OUTPUT_FOLDER`` under the same base name
    with a ``.txt`` extension. Progress messages are printed to stdout.

    Returns:
        None. When the input folder is missing or contains no PDF files, a
        message is printed and nothing is written.
    """
    try:
        entries = os.listdir(INPUT_FOLDER)
    except FileNotFoundError:
        # Report a missing input folder instead of crashing with a traceback.
        print(f"Input folder '{INPUT_FOLDER}/' does not exist.")
        return

    # os.listdir returns names in arbitrary order; sort for reproducible runs.
    pdf_files = sorted(f for f in entries if f.lower().endswith(".pdf"))

    if not pdf_files:
        # Name the folder that was actually scanned rather than a fixed label.
        print(f"No PDF files found in '{INPUT_FOLDER}/' folder.")
        return

    # Do not rely on an import-time side effect for the destination to exist.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    for pdf_file in pdf_files:
        pdf_path = os.path.join(INPUT_FOLDER, pdf_file)
        txt_filename = os.path.splitext(pdf_file)[0] + ".txt"
        txt_path = os.path.join(OUTPUT_FOLDER, txt_filename)

        print(f"Processing {pdf_file}...")
        text = extract_text_from_pdf(pdf_path)

        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)

        print(f"Saved extracted text to {txt_path}")

# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_all_pdfs()

Binary file not shown.
Binary file added Python/pdf_text_extractor/input/cacm12.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions Python/pdf_text_extractor/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PyPDF2
Loading