forked from smc40/askyourdocs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilepreprocessing.py
More file actions
63 lines (45 loc) · 1.82 KB
/
filepreprocessing.py
File metadata and controls
63 lines (45 loc) · 1.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import PyPDF2
from nltk import word_tokenize
# given pdffile path returns list of text sections with overlap, based on wordcount
def pdf_get_text_chunks(file_path, chunk_size, overlap):
    """Split the text of a PDF file into overlapping, word-based chunks.

    The text of every page is extracted and concatenated, tokenized with
    ``nltk.word_tokenize``, and cut into windows of ``chunk_size`` tokens.
    Each window starts ``chunk_size - overlap`` tokens after the previous
    one, so consecutive chunks share ``overlap`` tokens.

    Args:
        file_path: Path to the PDF file to read.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of ``(filename, chunk_text)`` tuples, where ``filename`` is
        the base name of ``file_path`` and ``chunk_text`` is the chunk's
        tokens joined with single spaces. The list is empty when no tokens
        were extracted.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or ``overlap`` is
            not smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A non-positive step would never advance the window and loop forever,
    # so reject it before touching the file.
    if chunk_size < 1 or step < 1:
        raise ValueError(
            "chunk_size must be >= 1 and overlap must be smaller than "
            f"chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    filename = os.path.basename(file_path)  # only the base name is stored with each chunk

    # Extract the text of every page while the file is still open.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page that yields no text at all.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    # Tokenize into words; punctuation tokens count towards chunk_size.
    words = word_tokenize(text)

    # Slicing clamps at the end of the list, so the last chunks may be shorter.
    return [
        (filename, ' '.join(words[start:start + chunk_size]))
        for start in range(0, len(words), step)
    ]
# calls pdf_get_text_chunks for entire folder returning filename and chunks
def pdf_folder_to_chunks(folder_path, chunk_size, overlap):
    """Chunk every PDF file found directly inside a folder.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    processed; sub-folders are not searched. Files are handled in sorted
    name order so the result is deterministic.

    Args:
        folder_path: Directory to scan for PDF files.
        chunk_size: Forwarded to ``pdf_get_text_chunks``.
        overlap: Forwarded to ``pdf_get_text_chunks``.

    Returns:
        A list with one entry per PDF file, each entry being the list
        returned by ``pdf_get_text_chunks`` for that file. Empty when the
        folder contains no PDF files.
    """
    folder = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory named "*.pdf" cannot be opened as a file; skip it.
        if not os.path.isfile(file_path):
            continue
        folder.append(pdf_get_text_chunks(file_path, chunk_size, overlap))
    return folder
# # Example usage 1 document and entire folder
# folder_path = "docs"
# file_path = f"{folder_path}/20211203_SwissPAR-Spikevax.pdf"
# chunk_size = 200
# overlap = 50
# text_chunks = pdf_get_text_chunks(file_path, chunk_size, overlap)
# all_pdfs = pdf_folder_to_chunks(folder_path, chunk_size, overlap)
# would like:
# preprocessing of text (punctuation not included in word count, etc.) and potentially sections