diff --git a/castrewinder/models.py b/castrewinder/models.py index c5ae12d..ee7cadc 100644 --- a/castrewinder/models.py +++ b/castrewinder/models.py @@ -3,7 +3,7 @@ class Feed(db.Model): __tablename__ = 'feed' id = db.Column(db.Integer, primary_key=True) - url = db.Column(db.String, unique = True, index=True) + url = db.Column(db.String, unique=True, index=True) etag = db.Column(db.String) last_modified = db.Column(db.String) last_published_element = db.Column(db.DateTime) @@ -19,6 +19,8 @@ class Episode(db.Model): published = db.Column(db.DateTime) content = db.Column(db.Text) feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'), index=True) + enclosure_url = db.Column(db.Text) + enclosure_is_active = db.Column(db.Boolean, index=True) # Use cascade='delete,all' to propagate the deletion of a Feed onto its Employees feed = db.relationship('Feed', backref = db.backref('episodes', diff --git a/castrewinder/utils.py b/castrewinder/utils.py index be6be9f..c31cbb0 100644 --- a/castrewinder/utils.py +++ b/castrewinder/utils.py @@ -304,23 +304,25 @@ def build_xml_feed(feed_object, feed_entries, publication_dates, options, feed_f fe.content(content = strip_tags(content['value']) if 'value' in content else '', type = content['type'] if 'type' in content else '') - for media in episode.get('media_content', []): - if media.get('type') != 'application/x-shockwave-flash': - fe.enclosure(url = media.get('url', ''), - length = str(media.get('filesize', '')), - type = media.get('type', '')) - - for enclosure in episode.get('enclosure', []): - if enclosure.get('type') != 'application/x-shockwave-flash': - fe.enclosure(url = enclosure.get('url', ''), - length = str(enclosure.get('filesize', '')), - type = enclosure.get('type', '')) + if entry.enclosure_is_active: + # don't add an enclosure or media if the enclosure link is not active + for media in episode.get('media_content', []): + if media.get('type') != 'application/x-shockwave-flash': + fe.enclosure(url = media.get('url', 
''), + length = str(media.get('filesize', '')), + type = media.get('type', '')) + + for enclosure in episode.get('enclosure', []): + if enclosure.get('type') != 'application/x-shockwave-flash': + fe.enclosure(url = enclosure.get('url', ''), + length = str(enclosure.get('filesize', '')), + type = enclosure.get('type', '')) link = episode.get('link', '') if link == '': link = feed.get('link') - if link == '': # This would be the perfect place + if link == '': # to link to a special HTML format feed. TODO. link = "%s#%s" % (request.url, "castrewinder_%s_%s" % (request.url, episode.get('id', ''))) @@ -329,10 +331,11 @@ def build_xml_feed(feed_object, feed_entries, publication_dates, options, feed_f for link in episode.get('links', []): if link.get('href','') != '': if link.get('rel') == 'enclosure': - links.append({'rel' : 'enclosure', - 'href' : link.get('href'), - 'type' : link.get('type', ''), - 'length': link.get('length', 0)}) + if entry.enclosure_is_active: + links.append({'rel' : 'enclosure', + 'href' : link.get('href'), + 'type' : link.get('type', ''), + 'length': link.get('length', 0)}) else: links.append({'rel' : 'alternate', 'href' : link.get('href'), diff --git a/feed_worker.py b/feed_worker.py index 66c36b2..71f305d 100644 --- a/feed_worker.py +++ b/feed_worker.py @@ -4,7 +4,7 @@ import calendar import time import json -from requests import get +from requests import get, head from dateutil import parser from urllib.parse import urlparse @@ -63,10 +63,21 @@ def add_entries_to_db(feed, feed_url, ignore_date = False): except: published = datetime.datetime.today() - if feed_object.last_published_element < published or ignore_date: + enclosure_url = get_enclosure_url_from_episode_content(content = entry) + + # # Calling the enclosure url status is too costly as of yet + # enclosure_status = get_url_status(url = enclosure_url) + # # If there was 301s, set last URL + # if enclosure_status[1] != '': + # enclosure_url = enclosure_status[1] + + if 
feed_object.last_published_element < published or ignore_date == True: new_entry = Episode(published = published, content = json.dumps(entry, default=json_serial), - feed_id = feed_object.id) + feed_id = feed_object.id, + enclosure_url = enclosure_url, + # enclosure_is_active = enclosure_status[0]) + enclosure_is_active = True) db.session.add(new_entry) if not ignore_date: @@ -494,6 +505,73 @@ def update_feeds(): return True +def verify_links(): + """ This goes through every podcast file link + and checks if it’s still available""" + + all_episodes = db.session.query(Episode).filter(Episode.enclosure_is_active == True).all() + + for episode in all_episodes: + + # If there's no enclosure_url specified + if not episode.enclosure_url: + # Gets the enclosure URL and sets it in DB + enclosure_url = get_enclosure_url_from_episode_content(content = json.loads(episode.content)) + episode.enclosure_url = enclosure_url + + enclosure_status = get_url_status(url = episode.enclosure_url) + + # If there were 301s, the second part of the tuple is defined, + # set it as the enclosure URL + if enclosure_status[1]: + episode.enclosure_url = enclosure_status[1] + + # Set status active/inactive in DB + episode.enclosure_is_active = enclosure_status[0] + + db.session.commit() + + return True + +def get_url_status(url): + # Gets the head of a request, and returns a tuple with 2 items: + # - True if the final status is 200, False otherwise + # - new URL if 301, '' if none + + try: + request_head = head(url, allow_redirects=True) + except Exception: + return (False, '') + + # check history for 301 + end_url = '' + history_codes = [resp.status_code for resp in reversed(request_head.history)] + if 301 in history_codes and 302 not in history_codes: + # the last occurrence of 301 is the first index (bc history is reversed) + last_301 = history_codes.index(301) + end_url = request_head.url + + return (True, end_url) if request_head.status_code == 200 else (False, '') + + +def 
get_enclosure_url_from_episode_content(content): + # Traverses an episode content element for enclosures + # RSS (or JSON Feed) + for enclosure in reversed(content.get('enclosure', [])): + # Only get the LAST enclosure of the post (as per RSS recommendations) + if enclosure.get('type') != 'application/x-shockwave-flash': + return enclosure.get('url') + + # Atom + for link in content.get('links', []): + # Only get the first link[rel="enclosure"] of the post + if link.get('rel') == 'enclosure' \ + and link.get('type') != 'application/x-shockwave-flash': + return link.get('href') + + # if no enclosure and no link[rel="enclosure"], return None + return None + if __name__ == '__main__': parser = argparse.ArgumentParser(description='You can import feeds into Cast Rewinder.', @@ -503,6 +581,7 @@ def update_feeds(): parser.add_argument('-w','--which_feed',help='''Get a feed’s ID from URL''') parser.add_argument('--feed_info',help='''Get a feed’s info from ID''') parser.add_argument('-u','--update_feeds',help='''Updates all feeds''', action='store_true') + parser.add_argument('-l','--verify_links',help='''Check all podcast links''', action='store_true') args = parser.parse_args() @@ -523,5 +602,8 @@ def update_feeds(): if args.update_feeds: update_feeds() + if args.verify_links: + verify_links() + if not any(vars(args).values()): ask_for_url() \ No newline at end of file diff --git a/migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py b/migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py new file mode 100644 index 0000000..59ef210 --- /dev/null +++ b/migrations/versions/b5adf7fac7ab_added_attachements_info_for_episodes.py @@ -0,0 +1,39 @@ +"""Added enclosures info for episodes + +Revision ID: b5adf7fac7ab +Revises: 4251f6c2a939 +Create Date: 2018-07-19 00:25:39.920621 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = 'b5adf7fac7ab' +down_revision = '4251f6c2a939' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('episode', sa.Column('enclosure_is_active', sa.Boolean(), nullable=True)) + op.add_column('episode', sa.Column('enclosure_url', sa.Text(), nullable=True)) + op.create_index(op.f('ix_episode_enclosure_is_active'), 'episode', ['enclosure_is_active'], unique=False) + # ### end Alembic commands ### + + # Populate column with True + op.execute(""" + UPDATE episode + SET enclosure_is_active = 'true' + """) + op.alter_column('episode', 'enclosure_is_active', nullable=False) + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_episode_enclosure_is_active'), table_name='episode') + op.drop_column('episode', 'enclosure_url') + op.drop_column('episode', 'enclosure_is_active') + # ### end Alembic commands ###