Skip to content

Commit c304b78

Browse files
committed
fix #43 #44 libmagic bugs + linelength
1 parent 49ab68a commit c304b78

File tree

11 files changed

+126
-43
lines changed

11 files changed

+126
-43
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,9 @@ dependencies = [
1919
"SQLAlchemy==2.0.43",
2020
"requests==2.32.3",
2121
"tqdm==4.67.1",
22+
"pylibmagic==0.5.0",
2223
"python-magic==0.4.27; sys_platform == 'linux'",
23-
"python-magic-bin==0.4.14; sys_platform == 'win32'",
24+
"python-magic-bin==0.4.14; sys_platform == 'win32' or sys_platform == 'darwin'",
2425
]
2526

2627
[project.scripts]

pywaybackup/Exception.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,7 @@ def exception(cls, message: str, e: Exception, tb=None):
3636
codeline = linecache.getline(filename, tb_line).strip()
3737
local_vars = tb_frame.f_locals
3838
exception_message += (
39-
f"!-- File: {filename}\n"
40-
f"!-- Function: {func_name}\n"
41-
f"!-- Line: {tb_line}\n"
42-
f"!-- Segment: {codeline}\n"
39+
f"!-- File: {filename}\n!-- Function: {func_name}\n!-- Line: {tb_line}\n!-- Segment: {codeline}\n"
4340
)
4441
else:
4542
exception_message += "!-- Traceback is None\n"
@@ -96,4 +93,4 @@ def exception_handler(exception_type, exception, traceback):
9693
if issubclass(exception_type, KeyboardInterrupt):
9794
sys.__excepthook__(exception_type, exception, traceback)
9895
return
99-
Exception.exception('UNCAUGHT EXCEPTION', exception, traceback) # uncaught exceptions also with custom scheme
96+
Exception.exception("UNCAUGHT EXCEPTION", exception, traceback) # uncaught exceptions also with custom scheme

pywaybackup/PyWayBackup.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -407,7 +407,11 @@ def paths(self, rel: bool = False) -> dict:
407407
"log": self._log,
408408
"debug": self._debug,
409409
}
410-
return {key: (os.path.relpath(path) if rel else path) for key, path in files.items() if path and os.path.exists(path)}
410+
return {
411+
key: (os.path.relpath(path) if rel else path)
412+
for key, path in files.items()
413+
if path and os.path.exists(path)
414+
}
411415

412416
def status(self) -> dict:
413417
"""

pywaybackup/Snapshot.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,9 @@ def modify(self, column, value):
101101
value: New value to set for the column.
102102
"""
103103
column = getattr(waybackup_snapshots, column)
104-
self._db.session.execute(update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value}))
104+
self._db.session.execute(
105+
update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value})
106+
)
105107
self._db.session.commit()
106108

107109
def create_output(self):

pywaybackup/SnapshotCollection.py

Lines changed: 36 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,9 @@ def _write_summary(self):
5050

5151
def _reset_locked_snapshots(self):
5252
"""Reset locked snapshots to unprocessed in the database."""
53-
self.db.session.execute(update(waybackup_snapshots).where(waybackup_snapshots.response == "LOCK").values(response=None))
53+
self.db.session.execute(
54+
update(waybackup_snapshots).where(waybackup_snapshots.response == "LOCK").values(response=None)
55+
)
5456
self.db.session.commit()
5557

5658
def _finalize_db(self):
@@ -140,11 +142,19 @@ def _insert_batch_safe(line_batch):
140142
waybackup_snapshots.url_origin,
141143
waybackup_snapshots.url_archive,
142144
)
143-
.filter(tuple_(waybackup_snapshots.timestamp, waybackup_snapshots.url_origin, waybackup_snapshots.url_archive).in_(keys))
145+
.filter(
146+
tuple_(
147+
waybackup_snapshots.timestamp, waybackup_snapshots.url_origin, waybackup_snapshots.url_archive
148+
).in_(keys)
149+
)
144150
.all()
145151
)
146152
existing_rows = set(existing)
147-
new_rows = [row for row in unique_batch if (row["timestamp"], row["url_origin"], row["url_archive"]) not in existing_rows]
153+
new_rows = [
154+
row
155+
for row in unique_batch
156+
if (row["timestamp"], row["url_origin"], row["url_archive"]) not in existing_rows
157+
]
148158
if new_rows:
149159
self.db.session.bulk_insert_mappings(waybackup_snapshots, new_rows)
150160
self.db.session.commit()
@@ -200,17 +210,25 @@ def _index_snapshots(self):
200210
# index for filtering last snapshots
201211
if self._mode_last:
202212
idx1 = Index(
203-
"idx_waybackup_snapshots_url_origin_timestamp_desc", waybackup_snapshots.url_origin, waybackup_snapshots.timestamp.desc()
213+
"idx_waybackup_snapshots_url_origin_timestamp_desc",
214+
waybackup_snapshots.url_origin,
215+
waybackup_snapshots.timestamp.desc(),
204216
)
205217
idx1.create(self.db.session.bind, checkfirst=True)
206218
# index for filtering first snapshots
207219
if self._mode_first:
208220
idx2 = Index(
209-
"idx_waybackup_snapshots_url_origin_timestamp_asc", waybackup_snapshots.url_origin, waybackup_snapshots.timestamp.asc()
221+
"idx_waybackup_snapshots_url_origin_timestamp_asc",
222+
waybackup_snapshots.url_origin,
223+
waybackup_snapshots.timestamp.asc(),
210224
)
211225
idx2.create(self.db.session.bind, checkfirst=True)
212226
# index for skippable snapshots
213-
idx3 = Index("idx_waybackup_snapshots_timestamp_url_origin_response", waybackup_snapshots.timestamp, waybackup_snapshots.url_origin)
227+
idx3 = Index(
228+
"idx_waybackup_snapshots_timestamp_url_origin_response",
229+
waybackup_snapshots.timestamp,
230+
waybackup_snapshots.url_origin,
231+
)
214232
idx3.create(self.db.session.bind, checkfirst=True)
215233

216234
def _filter_snapshots(self):
@@ -224,7 +242,9 @@ def _filter_snapshots(self):
224242
def _filter_mode():
225243
self._filter_mode = 0
226244
if self._mode_last or self._mode_first:
227-
ordering = waybackup_snapshots.timestamp.desc() if self._mode_last else waybackup_snapshots.timestamp.asc()
245+
ordering = (
246+
waybackup_snapshots.timestamp.desc() if self._mode_last else waybackup_snapshots.timestamp.asc()
247+
)
228248
# assign row numbers per url_origin
229249
rownum = (
230250
func.row_number()
@@ -266,7 +286,9 @@ def _enumerate_counter():
266286

267287
_filter_mode()
268288
_enumerate_counter()
269-
self._filter_response = self.db.session.query(waybackup_snapshots).where(waybackup_snapshots.response.in_(["404", "301"])).count()
289+
self._filter_response = (
290+
self.db.session.query(waybackup_snapshots).where(waybackup_snapshots.response.in_(["404", "301"])).count()
291+
)
270292
self.db.session.commit()
271293

272294
def _skip_set(self):
@@ -280,7 +302,12 @@ def _skip_set(self):
280302
for row in f:
281303
self.db.session.execute(
282304
update(waybackup_snapshots)
283-
.where(and_(waybackup_snapshots.timestamp == row["timestamp"], waybackup_snapshots.url_origin == row["url_origin"]))
305+
.where(
306+
and_(
307+
waybackup_snapshots.timestamp == row["timestamp"],
308+
waybackup_snapshots.url_origin == row["url_origin"],
309+
)
310+
)
284311
.values(
285312
url_archive=row["url_archive"],
286313
redirect_url=row["redirect_url"],

pywaybackup/Verbosity.py

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -68,9 +68,10 @@ def progress(cls, progress: int, maxval: int = None):
6868
cls.pbar = Progressbar(
6969
unit=" snapshot",
7070
desc="download file".ljust(15),
71-
total=maxval, ascii="░▒█",
72-
bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}"
73-
)
71+
total=maxval,
72+
ascii="░▒█",
73+
bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}",
74+
)
7475
if cls.pbar is not None and progress is not None and progress > 0:
7576
cls.pbar.update(progress)
7677

@@ -93,15 +94,30 @@ def filter_verbosity(cls, message: list):
9394

9495

9596
class Progressbar(Verbosity):
96-
def __init__(self, unit: str, desc: str, unit_scale: bool = False, total: int = None, ascii: str = None, bar_format: str = None):
97+
def __init__(
98+
self,
99+
unit: str,
100+
desc: str,
101+
unit_scale: bool = False,
102+
total: int = None,
103+
ascii: str = None,
104+
bar_format: str = None,
105+
):
97106
if not super().silent:
98107
self.unit = unit
99108
self.desc = desc
100109
self.unit_scale = unit_scale
101110
self.total = total
102111
self.ascii = ascii
103112
self.bar_format = bar_format
104-
self.pbar = tqdm(unit=self.unit, desc=self.desc, unit_scale=self.unit_scale, total=self.total, ascii=self.ascii, bar_format=self.bar_format)
113+
self.pbar = tqdm(
114+
unit=self.unit,
115+
desc=self.desc,
116+
unit_scale=self.unit_scale,
117+
total=self.total,
118+
ascii=self.ascii,
119+
bar_format=self.bar_format,
120+
)
105121

106122
def update(self, progress: int):
107123
"""

pywaybackup/archive_download.py

Lines changed: 20 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -169,7 +169,13 @@ def _download_loop(self, worker: Worker):
169169
try:
170170
download_status = self._download(worker=worker)
171171

172-
except (timeout, ConnectionRefusedError, ConnectionResetError, http.client.HTTPException, Exception) as e:
172+
except (
173+
timeout,
174+
ConnectionRefusedError,
175+
ConnectionResetError,
176+
http.client.HTTPException,
177+
Exception,
178+
) as e:
173179
if isinstance(e, (timeout, ConnectionRefusedError, ConnectionResetError)):
174180
if download_attempt < download_max_attempt:
175181
download_attempt += 1 # try again 2x with same connection
@@ -293,7 +299,10 @@ def _download(self, worker: Worker):
293299
try:
294300
context.response_data = gzip.decompress(context.response_data)
295301
except BadGzipFile:
296-
vb.write(verbose=None, content=f"Worker: {worker.id} - GZIP DECOMPRESS SKIPPED - {context.snapshot_url}")
302+
vb.write(
303+
verbose=None,
304+
content=f"Worker: {worker.id} - GZIP DECOMPRESS SKIPPED - {context.snapshot_url}",
305+
)
297306
pass
298307
file.write(context.response_data)
299308

@@ -313,7 +322,9 @@ def __handle_redirect(self, context: DownloadContext, worker: Worker) -> None:
313322
context (DownloadContext): The download context.
314323
worker (Worker): The worker instance.
315324
"""
316-
worker.message.store(verbose=True, result="REDIRECT", content=f"{context.response_status} {context.response_status_message}")
325+
worker.message.store(
326+
verbose=True, result="REDIRECT", content=f"{context.response_status} {context.response_status_message}"
327+
)
317328
worker.message.store(verbose=True, result="", info="FROM", content=context.snapshot_url)
318329
for _ in range(5):
319330
self.__download_response(context=context, worker=worker)
@@ -369,7 +380,9 @@ def __dl_result(self, context: DownloadContext, worker: Worker, result: str) ->
369380
Returns:
370381
bool: Always True (indicates result was processed).
371382
"""
372-
worker.message.store(verbose=True, result=result, content=f"{context.response_status} {context.response_status_message}")
383+
worker.message.store(
384+
verbose=True, result=result, content=f"{context.response_status} {context.response_status_message}"
385+
)
373386
worker.message.store(verbose=False, result=result)
374387
worker.message.store(verbose=True, result="", info="URL", content=context.snapshot_url)
375388
worker.message.store(verbose=True, result="", info="FILE", content=context.output_file)
@@ -386,7 +399,9 @@ def __dl_fail(self, context: DownloadContext, worker: Worker) -> bool:
386399
Returns:
387400
bool: Always False (indicates failure was processed).
388401
"""
389-
worker.message.store(verbose=None, result="UNKNOWN", content=f"{context.response_status} {context.response_status_message}")
402+
worker.message.store(
403+
verbose=None, result="UNKNOWN", content=f"{context.response_status} {context.response_status_message}"
404+
)
390405
worker.message.store(verbose=True, result="", info="URL", content=context.snapshot_url)
391406
return False
392407

pywaybackup/archive_save.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,18 +34,24 @@ def save_page(url: str):
3434

3535
if response_status == 302:
3636
location = response.getheader("Location")
37-
snapshot_timestamp = datetime.strptime(url_get_timestamp(location), "%Y%m%d%H%M%S").strftime("%Y-%m-%d %H:%M:%S")
37+
snapshot_timestamp = datetime.strptime(url_get_timestamp(location), "%Y%m%d%H%M%S").strftime(
38+
"%Y-%m-%d %H:%M:%S"
39+
)
3840
current_timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
3941
timestamp_difference = (
40-
datetime.strptime(current_timestamp, "%Y-%m-%d %H:%M:%S") - datetime.strptime(snapshot_timestamp, "%Y-%m-%d %H:%M:%S")
42+
datetime.strptime(current_timestamp, "%Y-%m-%d %H:%M:%S")
43+
- datetime.strptime(snapshot_timestamp, "%Y-%m-%d %H:%M:%S")
4144
).seconds / 60
4245
timestamp_difference = int(round(timestamp_difference, 0))
4346

4447
if timestamp_difference < 1:
4548
vb.write(verbose=None, content="\n-----> Response: 302 (new snapshot)")
4649
vb.write(verbose=None, content=f"SNAPSHOT URL: {location}")
4750
elif timestamp_difference >= 1:
48-
vb.write(verbose=None, content=f"\n-----> Response: 302 (existing snapshot - wait for {60 - timestamp_difference} minutes)")
51+
vb.write(
52+
verbose=None,
53+
content=f"\n-----> Response: 302 (existing snapshot - wait for {60 - timestamp_difference} minutes)",
54+
)
4955
vb.write(verbose=None, content=f"SNAPSHOT URL: {location}")
5056
vb.write(verbose=None, content=f"WAYBACK TIME: {snapshot_timestamp}")
5157
vb.write(verbose=None, content=f"REQUEST TIME: {current_timestamp}")

pywaybackup/db.py

Lines changed: 22 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
)
1818
from sqlalchemy.ext.declarative import declarative_base
1919
from sqlalchemy.orm import sessionmaker
20-
from typing import Optional # python 3.8
20+
from typing import Optional # python 3.8
2121

2222
Base = declarative_base()
2323

@@ -113,7 +113,9 @@ def init(cls, dbfile, query_identifier):
113113
Base.metadata.create_all(engine)
114114

115115
db = Database()
116-
if db.session.execute(select(waybackup_job.query_identifier).where(query_identifier == query_identifier)).fetchone():
116+
if db.session.execute(
117+
select(waybackup_job.query_identifier).where(query_identifier == query_identifier)
118+
).fetchone():
117119
cls.query_exist = True
118120
cls.query_progress = db.get_progress()
119121
else:
@@ -140,7 +142,9 @@ def write_progress(self, done: int, total: int):
140142
"""
141143
progress = f"{(done):,} / {(total):,}"
142144
self.session.execute(
143-
update(waybackup_job).where(waybackup_job.query_identifier == self.query_identifier).values(query_progress=progress)
145+
update(waybackup_job)
146+
.where(waybackup_job.query_identifier == self.query_identifier)
147+
.values(query_progress=progress)
144148
)
145149
self.session.commit()
146150

@@ -180,19 +184,31 @@ def set_insert_complete(self):
180184
"""
181185
Mark the job's insertion phase as complete in the database.
182186
"""
183-
self.session.execute(update(waybackup_job).where(waybackup_job.query_identifier == self.query_identifier).values(insert_complete=1))
187+
self.session.execute(
188+
update(waybackup_job)
189+
.where(waybackup_job.query_identifier == self.query_identifier)
190+
.values(insert_complete=1)
191+
)
184192
self.session.commit()
185193

186194
def set_index_complete(self):
187195
"""
188196
Mark the job's indexing phase as complete in the database.
189197
"""
190-
self.session.execute(update(waybackup_job).where(waybackup_job.query_identifier == self.query_identifier).values(index_complete=1))
198+
self.session.execute(
199+
update(waybackup_job)
200+
.where(waybackup_job.query_identifier == self.query_identifier)
201+
.values(index_complete=1)
202+
)
191203
self.session.commit()
192204

193205
def set_filter_complete(self):
194206
"""
195207
Mark the job's filtering phase as complete in the database.
196208
"""
197-
self.session.execute(update(waybackup_job).where(waybackup_job.query_identifier == self.query_identifier).values(filter_complete=1))
209+
self.session.execute(
210+
update(waybackup_job)
211+
.where(waybackup_job.query_identifier == self.query_identifier)
212+
.values(filter_complete=1)
213+
)
198214
self.session.commit()

pywaybackup/files.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,9 @@ def _build_query(self):
5454

5555
limit = f"&limit={self.limit}" if self.limit else ""
5656

57-
filter_statuscode = f"&filter=statuscode:({'|'.join(self.filter_statuscode)})$" if self.filter_statuscode else ""
57+
filter_statuscode = (
58+
f"&filter=statuscode:({'|'.join(self.filter_statuscode)})$" if self.filter_statuscode else ""
59+
)
5860
filter_filetype = f"&filter=original:.*\\.({'|'.join(self.filter_filetype)})$" if self.filter_filetype else ""
5961

6062
return f"https://web.archive.org/cdx/search/cdx?output=json&url={cdx_url}{period}&fl=timestamp,digest,mimetype,statuscode,original{limit}{filter_filetype}{filter_statuscode}"

0 commit comments

Comments
 (0)