-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_utils.py
More file actions
134 lines (106 loc) · 3.61 KB
/
pdf_utils.py
File metadata and controls
134 lines (106 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
A collection of utility functions for handling PDF files.
Requires:
comtypes
PyPDF2
progressbar
"""
from __future__ import unicode_literals, print_function
import os
from comtypes import client
import PyPDF2
import progressbar
# Only use TK when it's available
try:
from six.moves import tkinter
from six.moves.tkinter_filedialog import askdirectory
USE_TK = True
except ImportError:
USE_TK = False
# Value passed as `FileFormat` to Word's `SaveAs` when exporting a document
# as PDF (presumably `wdFormatPDF` from Word's WdSaveFormat enumeration --
# confirm against the Word object model reference).
COMTYPES_PDF_FORMAT = 17
class PDFError(Exception):
    """Error raised by the PDF utilities, e.g. for a non-empty temporary folder."""
def get_files_of_type(path, types):
    """
    Return the names of the files directly inside `path` whose name ends
    with one of the extensions in `types`.

    Args:
        path: Directory to inspect. Sub-directories are not searched.
        types: A single extension (e.g. ``'.pdf'``) or an iterable of
            extensions (e.g. ``('.doc', '.docx')``). Matching is
            case-insensitive.

    Returns:
        A list of bare file names (not joined with `path`). The list is
        empty when `path` cannot be walked or nothing matches.
    """
    # A lone string must be wrapped, not expanded: tuple('.pdf') would give
    # ('.', 'p', 'd', 'f') and match every name ending in one of those
    # characters. `type('')` is the text type whether or not
    # `unicode_literals` is in effect.
    if isinstance(types, (type(''), bytes)):
        types = (types,)
    suffixes = tuple(ext.lower() for ext in types)

    # Only the first entry yielded by os.walk is needed (the top directory);
    # the default covers a path that yields nothing.
    _, _, filenames = next(os.walk(path), (None, None, []))
    return [fn for fn in filenames if fn.lower().endswith(suffixes)]
def get_dir():
    """
    Ask the user for a directory and return its path.

    When Tkinter could be imported (`USE_TK`), a directory-chooser dialog is
    shown. Otherwise the path is read from a text prompt.

    Returns:
        The normalised path returned by the dialog, or the absolute form of
        the path typed at the prompt.

    Raises:
        Exception: If the path typed at the prompt is not an existing
            directory. The dialog result is not validated.
    """
    if USE_TK:
        # The dialog needs a root window; hide it so only the chooser shows.
        window = tkinter.Tk()
        window.withdraw()
        try:
            target_dir = askdirectory()
        finally:
            # Tear the hidden root window down even if the dialog fails.
            window.destroy()
        return os.path.normpath(target_dir)

    path = input('Enter path:')
    target_dir = os.path.abspath(path)
    # The path to check must be passed explicitly; isdir() takes one argument.
    if not os.path.isdir(target_dir):
        raise Exception('Path is not a valid directory.')
    return target_dir
def check_tmp_path(tmp_path):
    """
    Ensure that `tmp_path` is an existing, empty folder.

    The folder (including any missing parent folders) is created when it
    does not exist yet.

    Args:
        tmp_path: Path of the folder to check or create.

    Raises:
        PDFError: If `tmp_path` is an existing folder that already contains
            entries.
    """
    if not os.path.isdir(tmp_path):
        os.makedirs(tmp_path)
    elif os.listdir(tmp_path):
        # A non-empty listing means the folder has leftover content.
        raise PDFError('Temporary directory must be empty.')
def word_to_pdf(path, tmp_path=None):
    """
    Convert every Word document directly inside `path` to PDF.

    The PDFs are written to `tmp_path`, one per source document, named after
    the source file with its final extension replaced by ``.pdf``.

    Args:
        path: Folder containing the ``.doc`` / ``.docx`` files.
        tmp_path: Output folder. Defaults to ``<path>/.tmp``. It is passed
            to `check_tmp_path`, so an existing folder must be empty and a
            missing one is created.

    Raises:
        PDFError: Propagated from `check_tmp_path` when the output folder is
            not empty.
    """
    if tmp_path is None:
        tmp_path = os.path.join(path, '.tmp')
    check_tmp_path(tmp_path)

    # Word runs as a separate application and may not share this process's
    # working directory, so give it absolute paths.
    src_dir = os.path.abspath(path)
    out_dir = os.path.abspath(tmp_path)

    filenames = get_files_of_type(path, ('.doc', '.docx'))
    bar = progressbar.ProgressBar(max_value=len(filenames))

    # Use Word itself to 'save as' each file as a PDF via the comtypes library
    word = client.CreateObject('Word.Application')
    try:
        word.Visible = False
        for i, fn in enumerate(filenames):
            doc = word.Documents.Open(os.path.join(src_dir, fn))
            try:
                # splitext drops only the final extension, so names such as
                # 'report.v2.docx' keep their full stem.
                out_fn = '{}.pdf'.format(os.path.splitext(fn)[0])
                doc.SaveAs(os.path.join(out_dir, out_fn), FileFormat=COMTYPES_PDF_FORMAT)
            finally:
                doc.Close()
            bar.update(i + 1)
    finally:
        # Always shut Word down; otherwise a failed conversion leaves an
        # invisible Word instance running.
        word.Quit()
def merge_pdfs(path, out_path=None, use_outlines=False):
    """
    Combine all PDFs found directly inside `path` into a single document.

    Each input file is appended with a bookmark named after the file (its
    name without the final extension).

    Args:
        path: Folder containing the PDFs to merge.
        out_path: Output file name and/or location. Defaults to
            ``<path>/combined.pdf``. A file already at this location is
            overwritten and is never used as an input.
        use_outlines: When true, set the merged document's page mode to
            ``/UseOutlines``.
    """
    print('\nCombining pages')
    if out_path is None:
        out_path = os.path.join(path, 'combined.pdf')

    def _canonical(p):
        # Comparable form of a path, used to recognise the output file.
        return os.path.normcase(os.path.abspath(p))

    # Skip an output file left over from an earlier run: it would be opened
    # for reading as an input and then truncated when the result is written.
    out_key = _canonical(out_path)
    filenames = [
        fn for fn in get_files_of_type(path, ('.pdf',))
        if _canonical(os.path.join(path, fn)) != out_key
    ]

    merger = PyPDF2.PdfFileMerger()
    bar = progressbar.ProgressBar(max_value=len(filenames), redirect_stdout=True)
    files = []
    try:
        for i, fn in enumerate(filenames):
            # splitext drops only the final extension, keeping dotted stems.
            bk_txt = os.path.splitext(fn)[0]
            curr_path = os.path.join(path, fn)
            # Purposefully NOT using `with`, see http://stackoverflow.com/q/6773631
            # Input files can't be closed until the output file is saved,
            # so collect them and close them all at the end.
            in_file = open(curr_path, 'rb')
            files.append(in_file)
            merger.append(in_file, bookmark=bk_txt, import_bookmarks=False)
            bar.update(i + 1)

        if use_outlines:
            merger.setPageMode('/UseOutlines')

        with open(out_path, 'wb') as out_file:
            merger.write(out_file)
    finally:
        # Close the inputs whether or not the merge succeeded.
        for in_file in files:
            in_file.close()