From fdc0bdc116470d784ab790731c3b2817b9da53ed Mon Sep 17 00:00:00 2001 From: Joachim Date: Thu, 19 Jul 2018 12:32:30 +0200 Subject: [PATCH 1/5] Preparing changes with *a migration* --- castrewinder/models.py | 4 +- ...ab_added_attachements_info_for_episodes.py | 39 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py diff --git a/castrewinder/models.py b/castrewinder/models.py index c5ae12d..ee7cadc 100644 --- a/castrewinder/models.py +++ b/castrewinder/models.py @@ -3,7 +3,7 @@ class Feed(db.Model): __tablename__ = 'feed' id = db.Column(db.Integer, primary_key=True) - url = db.Column(db.String, unique = True, index=True) + url = db.Column(db.String, unique=True, index=True) etag = db.Column(db.String) last_modified = db.Column(db.String) last_published_element = db.Column(db.DateTime) @@ -19,6 +19,8 @@ class Episode(db.Model): published = db.Column(db.DateTime) content = db.Column(db.Text) feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'), index=True) + enclosure_url = db.Column(db.Text) + enclosure_is_active = db.Column(db.Boolean, index=True) # Use cascade='delete,all' to propagate the deletion of a Feed onto its Employees feed = db.relationship('Feed', backref = db.backref('episodes', diff --git a/migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py b/migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py new file mode 100644 index 0000000..59ef210 --- /dev/null +++ b/migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py @@ -0,0 +1,39 @@ +"""Added enclosures info for episodes + +Revision ID: b5adf7fac7ab +Revises: 4251f6c2a939 +Create Date: 2018-07-19 00:25:39.920621 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = 'b5adf7fac7ab' +down_revision = '4251f6c2a939' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('episode', sa.Column('enclosure_is_active', sa.Boolean(), nullable=True)) + op.add_column('episode', sa.Column('enclosure_url', sa.Text(), nullable=True)) + op.create_index(op.f('ix_episode_enclosure_is_active'), 'episode', ['enclosure_is_active'], unique=False) + # ### end Alembic commands ### + + # Populate column with True + op.execute(""" + UPDATE episode + SET enclosure_is_active = 'true' + """) + op.alter_column('episode', 'enclosure_is_active', nullable=False) + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_episode_enclosure_is_active'), table_name='episode') + op.drop_column('episode', 'enclosure_url') + op.drop_column('episode', 'enclosure_is_active') + # ### end Alembic commands ### From ed25b53c953ce2ebbfbba6eaac0b26cbb396dcac Mon Sep 17 00:00:00 2001 From: Joachim Date: Thu, 19 Jul 2018 12:38:40 +0200 Subject: [PATCH 2/5] Basic feature --- feed_worker.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/feed_worker.py b/feed_worker.py index 5bff105..ed7993f 100644 --- a/feed_worker.py +++ b/feed_worker.py @@ -4,7 +4,7 @@ import calendar import time import json -from requests import get +from requests import get, head from dateutil import parser from urllib.parse import urlparse @@ -376,12 +376,77 @@ def update_feeds(): return True +def verify_links(): + """ This goes through every podcast file link + and checks if it’s still available""" + + all_episodes = db.session.query(Episode).filter(Episode.enclosure_is_active == True).all() + + for episode in all_episodes: + + # If there's no enclosure_url specified + if not episode.enclosure_url: + # Gets the enclosure URL and sets in in DB + enclosure_url = 
get_enclosure_url_from_episode_content(content = json.loads(episode.content)) + episode.enclosure_url = enclosure_url + + enclosure_status = get_url_status(url = episode.enclosure_url) + + # If there was 301s, set last URL + if enclosure_status[1] != '': + episode.enclosure_url = enclosure_status[1] + + # Set status active/inactive in DB + episode.enclosure_is_active = enclosure_status[0] + + db.session.commit() + + return True + +def get_url_status(url): + # Gets the head of a request, and returns False if anything other than 2xx-3xx + + try: + request_head = head(url, allow_redirects=True) + except Exception: + return False + + # check history for 301 + end_url = '' + history_codes = [resp.status_code for resp in reversed(request_head.history)] + if 301 in history_codes and 302 not in history_codes: + # the last occurrence of 301 is the first index (bc history is reversed) + last_301 = history_codes.index(301) + end_url = request_head.url + + return (True, end_url) if request_head.status_code == 200 else (False, None) + + +def get_enclosure_url_from_episode_content(content): + # Traverses an episode content element for enclosures + # RSS (or JSON Feed) + for enclosure in reversed(content.get('enclosure', [])): + # Only get the LAST enclosure of the post (as per RSS recommendations) + if enclosure.get('type') != 'application/x-shockwave-flash': + return enclosure.get('url') + + # Atom + for link in content.get('links', []): + # Only get the first link[rel="enclosure"] of the post + if link.get('rel') == 'enclosure' \ + and link.get('type') != 'application/x-shockwave-flash': + return link.get('href') + + # if no enclosure and no link[rel="enclosure"], return None + return None + if __name__ == '__main__': parser = argparse.ArgumentParser(description='You can import feeds into Cast Rewinder.', prog='Cast Rewinder') parser.add_argument('-f','--feed_url',help='''Specify an URL to import''') parser.add_argument('-u','--update_feeds',help='''Updates all feeds''', 
action='store_true') + parser.add_argument('-l','--verify_links',help='''Check all podcast links''', action='store_true') args = parser.parse_args() @@ -393,5 +458,8 @@ def update_feeds(): if args.update_feeds: update_feeds() + if args.verify_links: + verify_links() + if not any(vars(args).values()): ask_for_url() \ No newline at end of file From 85c02306a76398a5fe2bb649ab207ee50f2bcfc5 Mon Sep 17 00:00:00 2001 From: Joachim Date: Thu, 19 Jul 2018 19:54:25 +0200 Subject: [PATCH 3/5] Delete dead attachements --- castrewinder/utils.py | 39 +++++++++++++++++++++++++-------------- feed_worker.py | 13 ++++++++++++- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/castrewinder/utils.py b/castrewinder/utils.py index 605e96a..3898a7a 100644 --- a/castrewinder/utils.py +++ b/castrewinder/utils.py @@ -291,24 +291,35 @@ def build_xml_feed(feed_object, feed_entries, publication_dates, options, feed_f fe.content(content = strip_tags(content['value']) if 'value' in content else '', type = content['type'] if 'type' in content else '') - for media in episode.get('media_content', []): - if media.get('type') != 'application/x-shockwave-flash': - fe.enclosure(url = media.get('url'), - length = str(media.get('filesize')), - type = media.get('type')) + if entry.enclosure_is_active: + # don't add an enclosure or media if the enclosure link is not active + for media in episode.get('media_content', []): + if media.get('type') != 'application/x-shockwave-flash': + fe.enclosure(url = media.get('url'), + length = str(media.get('filesize')), + type = media.get('type')) + + for enclosure in episode.get('enclosure', []): + if enclosure.get('type') != 'application/x-shockwave-flash': + fe.enclosure(url = enclosure.get('url'), + length = str(enclosure.get('filesize')), + type = enclosure.get('type')) - for enclosure in episode.get('enclosure', []): - if enclosure.get('type') != 'application/x-shockwave-flash': - fe.enclosure(url = enclosure.get('url'), - length = 
str(enclosure.get('filesize')), - type = enclosure.get('type')) fe.link(href = episode.get('link', ''), rel = 'alternate') for link in episode.get('links', []): - fe.link(rel = link.get('rel', ''), - href = link.get('href', ''), - type = link.get('type', ''), - length = link.get('length', '')) + if entry.enclosure_is_active: + fe.link(rel = link.get('rel', ''), + href = link.get('href', ''), + type = link.get('type', ''), + length = link.get('length', '')) + else: + # don't add an enclosure if the enclosure link is not active + if link.get('rel', '') != 'enclosure': + fe.link(rel = link.get('rel', ''), + href = link.get('href', ''), + type = link.get('type', ''), + length = link.get('length', '')) if 'image' in episode and 'href' in episode['image']: image_url = episode['image']['href'] diff --git a/feed_worker.py b/feed_worker.py index ed7993f..d46c881 100644 --- a/feed_worker.py +++ b/feed_worker.py @@ -60,10 +60,21 @@ def add_entries_to_db(feed, feed_url, ignore_date = False): except TypeError: published = datetime.datetime.today() + enclosure_url = get_enclosure_url_from_episode_content(content = entry) + + # # Calling the enclosure url status is too costly as of yet + # enclosure_status = get_url_status(url = enclosure_url) + # # If there was 301s, set last URL + # if enclosure_status[1] != '': + # enclosure_url = enclosure_status[1] + if feed_object.last_published_element < published or ignore_date == True: new_entry = Episode(published = published, content = json.dumps(entry, default=json_serial), - feed_id = feed_object.id) + feed_id = feed_object.id, + enclosure_url = enclosure_url, + # enclosure_is_active = enclosure_status[0]) + enclosure_is_active = True) db.session.add(new_entry) if not ignore_date: From 339f995309982eac01418faa0d692f1c62385088 Mon Sep 17 00:00:00 2001 From: Joachim Date: Wed, 1 Aug 2018 11:22:54 +0200 Subject: [PATCH 4/5] Content-type to Last-Modified (1) --- feed_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/feed_worker.py b/feed_worker.py index 95be324..617d4d9 100644 --- a/feed_worker.py +++ b/feed_worker.py @@ -237,7 +237,7 @@ def import_feed(url, ignore_date = False): else: # Don't populate the Feed Table if it already contains the feed - response_headers = (response.headers.get('ETag', None), response.headers.get('Content-Type', None)) + response_headers = (response.headers.get('ETag', None), response.headers.get('Last-Modified', None)) add_feed_to_db(feed = feed, feed_url = feed_url, response_headers = response_headers) # Populate the Episode Table From c13dba9b1285557ebfe72bbac0c132b01141ddd3 Mon Sep 17 00:00:00 2001 From: Joachim Robert Date: Wed, 24 Oct 2018 17:14:18 +0200 Subject: [PATCH 5/5] Fix 301s --- feed_worker.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/feed_worker.py b/feed_worker.py index 617d4d9..b2d7887 100644 --- a/feed_worker.py +++ b/feed_worker.py @@ -413,7 +413,8 @@ def verify_links(): enclosure_status = get_url_status(url = episode.enclosure_url) - # If there was 301s, set last URL + # If there were 301s, the second part of the tuple is non-empty: + # set it as the enclosure URL if enclosure_status[1] != '': episode.enclosure_url = enclosure_status[1] @@ -425,12 +426,14 @@ def verify_links(): return True def get_url_status(url): - # Gets the head of a request, and returns False if anything other than 2xx-3xx + # Gets the head of a request, and returns a tuple with 2 items: + # - True if the final response status is 200, False otherwise + # - new URL if 301, '' if none (None when the final status is not 200) try: request_head = head(url, allow_redirects=True) except Exception: - return False + return (False, '') # check history for 301 end_url = ''