calibre

Conversion Error - Fetch news from Pocket

Bug #1537521 reported by Michael Bryce on 2016-01-24

This bug affects 1 person

Affects		Status	Importance	Assigned to	Milestone
	calibre	Won't Fix	Undecided	darkom

Bug Description

Problem description: Failed to fetch news from Pocket. Tried restarting calibre and re-adding the Pocket recipe as a custom recipe; neither had any effect.

INPUT
"""
Pocket Calibre Recipe v1.4
"""
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from string import Template
import json
import operator
import re
import tempfile
import urllib
import urllib2

__license__ = 'GPL v3'
__copyright__ = '''
2010, Darko Miletic <darko.miletic at gmail.com>
2011, Przemyslaw Kryger <pkryger at gmail.com>
2012-2013, tBunnyMan <Wag That Tail At Me dot com>
'''

class Pocket(BasicNewsRecipe):
    """
    Recipe that turns a user's unread Pocket articles into a periodical.

    The recipe logs in through Pocket's legacy login form, lists unread
    articles through the v3 ``get`` endpoint, downloads each article's text
    view, and (optionally) archives the downloaded articles through the v3
    ``send`` endpoint once the download has finished.
    """
    title = 'Pocket'
    __author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
    description = '''Personalized news feeds. Go to getpocket.com to setup up
                  your news. This version displays pages of articles from
                  oldest to newest, with max & minimum counts, and marks
                  articles read after downloading.'''
    publisher = 'getpocket.com'
    category = 'news, custom'

    # Settings people change
    oldest_article = 7.0
    max_articles_per_feed = 50
    minimum_articles = 10
    mark_as_read_after_dl = True # Set this to False for testing
    sort_method = 'oldest' # MUST be either 'oldest' or 'newest'
    # To filter by tag this needs to be a single tag in quotes; IE 'calibre'
    only_pull_tag = None

    # You don't want to change anything under
    no_stylesheets = True
    use_embedded_content = False
    needs_subscription = True
    articles_are_obfuscated = True
    apikey = '19eg0e47pbT32z4793Tf021k99Afl889'
    index_url = u'https://getpocket.com'
    read_api_url = index_url + u'/v3/get'
    modify_api_url = index_url + u'/v3/send'
    legacy_login_url = index_url + u'/l' # We use this to cheat oAuth
    # Class-level default only; parse_index() rebinds this per instance so
    # the shared list object is never mutated.
    articles = []

    def _quote(self, value):
        """
        Percent-encode a single query-string value.

        Unicode is encoded as UTF-8 first because urllib.quote_plus cannot
        handle non-ASCII unicode objects; any other value (e.g. an int) is
        converted with str().
        """
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        return urllib.quote_plus(str(value))

    def get_browser(self, *args, **kwargs):
        """
        Return a browser that is logged in to Pocket.

        We need to pretend to be a recent version of safari for the mac to
        prevent User-Agent checks. Pocket requires a username and password,
        so fail loudly (via user_error) if either is missing from the config.
        The positional and keyword arguments are accepted but not forwarded.
        """
        # Adjacent literals instead of backslash continuations, so that the
        # source indentation does not end up inside the User-Agent header.
        br = BasicNewsRecipe.get_browser(
            self,
            user_agent=('Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; '
                        'en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) '
                        'Version/5.0.3 Safari/533.19.4'))
        if self.username is not None and self.password is not None:
            # Submit the first form on the legacy login page.
            br.open(self.legacy_login_url)
            br.select_form(nr=0)
            br['feed_id'] = self.username
            br['password'] = self.password
            br.submit()
        else:
            self.user_error("This Recipe requires authentication")
        return br

    def get_auth_uri(self):
        """
        Return the authentication part of an API query string.

        The result starts with '&' and contains the api key followed by the
        username and password, each percent-encoded. If either credential is
        missing, user_error() is called and only the api key is included.
        """
        uri = u'&apikey={0}'.format(self._quote(self.apikey))
        if self.username is None or self.password is None:
            self.user_error("Username or password is blank.")
        else:
            uri = u'{0}&username={1}'.format(uri, self._quote(self.username))
            uri = u'{0}&password={1}'.format(uri, self._quote(self.password))
        return uri

    def get_pull_articles_uri(self):
        """
        Return the query-string fragment that selects the articles to pull.

        Requests unread items of content type 'article', ordered by
        sort_method and limited to max_articles_per_feed, optionally
        restricted to only_pull_tag. Every value is percent-encoded.
        """
        params = [
            (u'state', u'unread'),
            (u'contentType', u'article'),
            (u'sort', self.sort_method),
            (u'count', self.max_articles_per_feed),
        ]
        if self.only_pull_tag is not None:
            params.append((u'tag', self.only_pull_tag))
        return u''.join(
            u'&{0}={1}'.format(name, self._quote(value))
            for name, value in params)

    def parse_index(self):
        """
        Fetch the list of unread articles and return it as a single feed.

        Returns a one-element list holding a (feed title, articles) tuple,
        or an empty list if the API request fails. If fewer than
        minimum_articles items are returned, marking as read is disabled and
        user_error() is called.
        """
        fetch_url = u"{0}?{1}{2}".format(
            self.read_api_url,
            self.get_auth_uri(),
            self.get_pull_articles_uri()
        )
        try:
            request = urllib2.Request(fetch_url)
            response = urllib2.urlopen(request)
            pocket_feed = json.load(response)['list']
        except urllib2.HTTPError as e:
            self.log.exception("Pocket returned an error: {0}".format(e.info()))
            return []
        except urllib2.URLError as e:
            # Log the endpoint only: fetch_url carries the username and
            # password in its query string and must not end up in logs.
            self.log.exception("Unable to connect to getpocket.com's api: {0}\nurl: {1}".format(e, self.read_api_url))
            return []

        # Anything other than a JSON object (e.g. an empty array) has no
        # items to iterate over; treat it as an empty feed.
        if not isinstance(pocket_feed, dict):
            pocket_feed = {}

        if len(pocket_feed) < self.minimum_articles:
            self.mark_as_read_after_dl = False
            self.user_error("Only {0} articles retrieved, minimum_articles not reached".format(len(pocket_feed)))

        # Build a fresh list instead of appending to self.articles, which
        # would otherwise mutate the list shared through the class attribute.
        articles = [
            {
                'item_id': item_id,
                'title': item['resolved_title'],
                'date': item['time_updated'],
                'url': u'{0}/a/read/{1}'.format(self.index_url, item_id),
                'real_url': item['resolved_url'],
                'description': item['excerpt'],
                'sort': item['sort_id']
            }
            for item_id, item in pocket_feed.iteritems()
        ]
        self.articles = sorted(articles, key=operator.itemgetter('sort'))
        return [("My Pocket Articles for {0}".format(strftime('[%I:%M %p]')), self.articles)]

    def get_textview(self, url):
        """
        Return the parsed JSON 'article' entry for the article page at url.

        Since Pocket's v3 API they removed access to textview. They also
        redesigned their page to make it much harder to scrape their
        textview. We need to pull the article page, retrieve the formCheck
        id from it, then use that id to query for the json version.
        This function will break when pocket hates us.

        Raises ValueError if the formCheck id cannot be found on the page,
        and re-raises urllib2.HTTPError if the json request fails.
        """
        ajax_url = self.index_url + u'/a/x/getArticle.php'
        soup = self.index_to_soup(url)
        fc_tag = soup.find('script', text=re.compile("formCheck"))
        fc_match = None
        if fc_tag is not None:
            fc_match = re.search(r"formCheck = \'([\d\w]+)\';", fc_tag)
        if fc_match is None:
            # Fail with a clear message instead of an opaque error on None.
            raise ValueError(
                "Unable to find the formCheck id on {0}".format(url))
        fc_id = fc_match.group(1)
        # The item id is the last path component of the article url.
        article_id = url.split("/")[-1]
        data = urllib.urlencode({'itemId': article_id, 'formCheck': fc_id})
        try:
            response = self.browser.open(ajax_url, data)
        except urllib2.HTTPError as e:
            self.log.exception("unable to get textview {0}".format(e.info()))
            raise  # bare raise keeps the original traceback
        return json.load(response)['article']

    def get_obfuscated_article(self, url):
        """
        Write the article at url to a temporary HTML file; return its path.

        Our get_textview returns parsed json, so turn it into simple HTML
        (title heading plus body) that is well parsed by calibre. Image
        placeholder divs in the body are replaced by real img tags. The
        temporary file is created with delete=False, so it is not removed
        here.
        """
        article = self.get_textview(url)
        template = Template('<h1>$title</h1><div class="body">$body</div>')
        tmpbody = article['article']
        # A missing, null or empty 'images' entry simply means there are no
        # placeholders to replace.
        images = article.get('images') or {}
        for img in images:
            imgdiv = '<div id="RIL_IMG_{0}" class="RIL_IMG"></div>'.format(images[img]['image_id'])
            # Self-closing tag; the previous '\>' put a stray backslash
            # into the generated markup.
            imgtag = '<img src="{0}" />'.format(images[img]['src'])
            tmpbody = tmpbody.replace(imgdiv, imgtag)

        html = template.safe_substitute(
            title=article['title'],
            body=tmpbody
        )
        # The temporary file is opened in binary mode; encode explicitly so
        # that non-ASCII text does not depend on an implicit conversion.
        if isinstance(html, unicode):
            html = html.encode('utf-8')
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(html)
        return tf.name

    def mark_as_read(self, mark_list):
        """
        Archive every Pocket item whose id is in mark_list.

        Returns None when the request succeeds or when there is nothing to
        archive, and an empty list when the request fails (the failure is
        logged, not raised).
        """
        if not mark_list:
            return
        actions_list = [
            {'action': 'archive', 'item_id': article_id}
            for article_id in mark_list
        ]
        # The JSON payload contains characters that are not valid in a URL
        # ('{', '"', ...), so it has to be percent-encoded.
        mark_read_url = u'{0}?actions={1}{2}'.format(
            self.modify_api_url,
            urllib.quote(json.dumps(actions_list, separators=(',', ':'))),
            self.get_auth_uri()
        )
        try:
            request = urllib2.Request(mark_read_url)
            urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            self.log.exception('Pocket returned an error while archiving articles: {0}'.format(e))
            return []
        except urllib2.URLError as e:
            self.log.exception("Unable to connect to getpocket.com's modify api: {0}".format(e))
            return []

    def cleanup(self):
        """Archive the downloaded articles if mark_as_read_after_dl is set."""
        if self.mark_as_read_after_dl:
            self.mark_as_read([x['item_id'] for x in self.articles])

    def default_cover(self, cover_file):
        """
        Create a generic cover for recipes that don't have a cover.

        This override adds the time to the cover. The image data is written
        to cover_file. Returns True on success and False (after logging) if
        the cover could not be generated.
        """
        try:
            from calibre.ebooks import calibre_cover
            title = self.title if isinstance(self.title, unicode) else \
                self.title.decode('utf-8', 'replace')
            date = strftime(self.timefmt)
            time = strftime('[%I:%M %p]')
            img_data = calibre_cover(title, date, time)
            cover_file.write(img_data)
            cover_file.flush()
        except Exception:
            # Cover generation is best effort, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare except would.
            self.log.exception('Failed to generate default cover')
            return False
        return True

    def user_error(self, error_message):
        """
        Report a fatal, user-facing error.

        Uses abort_recipe_processing() when the base class provides it;
        otherwise logs the message and raises RuntimeError.
        """
        if hasattr(self, 'abort_recipe_processing'):
            self.abort_recipe_processing(error_message)
        else:
            self.log.exception(error_message)
            raise RuntimeError(error_message)

# vim:ft=python tabstop=8 expandtab shiftwidth=4 softtabstop=4

OUTPUT
calibre, version 2.49.0 (win32, isfrozen: True)
Conversion Error: Failed: Fetch news from Pocket

Fetch news from Pocket
Resolved conversion options
calibre version: 2.49.0
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'change_justification': 'original',
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': None,
'dehyphenate': True,
'delete_blank_paragraphs': True,
'disable_font_rescaling': False,
'dont_download_recipe': False,
'dont_split_on_page_breaks': True,
'duplicate_links_in_toc': False,
'embed_all_fonts': False,
'embed_font_family': None,
'enable_heuristics': False,
'epub_flatten': False,
'epub_inline_toc': False,
'epub_toc_at_end': False,
'expand_css': False,
'extra_css': None,
'extract_to': None,
'filter_css': None,
'fix_indents': True,
'flow_size': 260,
'font_size_mapping': None,
'format_scene_breaks': True,
'html_unwrap_factor': 0.4,
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0x05D49110>,
'insert_blank_line': False,
'insert_blank_line_size': 0.5,
'insert_metadata': False,
'isbn': None,
'italicize_common_cases': True,
'keep_ligatures': False,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'markup_chapter_headings': True,
'max_toc_links': 50,
'minimum_line_height': 120.0,
'no_chapters_in_toc': False,
'no_default_epub_cover': False,
'no_inline_navbars': False,
'no_svg_cover': False,
'output_profile': <calibre.customize.profiles.SonyReaderOutput object at 0x05D0C5F0>,
'page_breaks_before': None,
'prefer_metadata_cover': False,
'preserve_cover_aspect_ratio': False,
'pretty_print': True,
'pubdate': None,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_fake_margins': True,
'remove_first_image': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'renumber_headings': True,
'replace_scene_breaks': '',
'search_replace': None,
'series': None,
'series_index': None,
'smarten_punctuation': False,
'sr1_replace': '',
'sr1_search': '',
'sr2_replace': '',
'sr2_search': '',
'sr3_replace': '',
'sr3_search': '',
'start_reading_at': None,
'subset_embedded_fonts': False,
'tags': None,
'test': False,
'timestamp': None,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'toc_title': None,
'unsmarten_punctuation': False,
'unwrap_lines': True,
'use_auto_toc': False,
'verbose': 2}
InputFormatPlugin: Recipe Input running
Using custom recipe
Python function terminated unexpectedly
  <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)> (Error Code: 1)
Traceback (most recent call last):
  File "site.py", line 132, in main
  File "site.py", line 109, in run_entry_point
  File "site-packages\calibre\utils\ipc\worker.py", line 190, in main
  File "site-packages\calibre\gui2\convert\gui_conversion.py", line 25, in gui_convert
  File "site-packages\calibre\ebooks\conversion\plumber.py", line 1051, in run
  File "site-packages\calibre\customize\conversion.py", line 241, in __call__
  File "site-packages\calibre\ebooks\conversion\plugins\recipe_input.py", line 116, in convert
  File "site-packages\calibre\web\feeds\news.py", line 918, in __init__
  File "<string>", line 65, in get_browser
  File "site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_mechanize.py", line 203, in open
  File "site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_mechanize.py", line 230, in _mech_open
  File "site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_opener.py", line 193, in open
  File "site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_urllib2_fork.py", line 344, in _open
  File "site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_urllib2_fork.py", line 332, in _call_chain
  File "site-packages\calibre\utils\browser.py", line 25, in https_open
  File "site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_urllib2_fork.py", line 1118, in do_open
urllib2.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)>

Tags:

Revision history for this message

Kovid Goyal (kovid) wrote on 2016-01-25: Re: calibre bug 1537521

Changing the component for this bug.

assignee darko-miletic
status triaged

Changed in calibre:
assignee:	nobody → darkom (darko-miletic)
status:	New → Triaged

darkom (darko-miletic) on 2016-12-05

Changed in calibre:
status:	Triaged → Won't Fix

Report a bug

This report contains Public information

Everyone can see this information.

You are

Subscribing...

Edit bug mail

Other bug subscribers

Remote bug watches

Bug watches keep track of this bug in other bug trackers.