From 78f858a87596a883afe6c0610fce3426c76ed86d Mon Sep 17 00:00:00 2001 From: xcffl <--list> Date: Mon, 2 Mar 2020 22:08:38 +0800 Subject: [PATCH] Fix & update Douban API --- src/calibre/ebooks/metadata/sources/douban.py | 304 +++++++++--------- 1 file changed, 154 insertions(+), 150 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index bb044f7cca..67ec87ee49 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ; 2011, Li Fanxi ' __docformat__ = 'restructuredtext en' @@ -14,27 +14,26 @@ try: except ImportError: from Queue import Empty, Queue - from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import Option, Source from calibre.ebooks.metadata.book.base import Metadata from calibre import as_unicode NAMESPACES = { - 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', - 'atom' : 'http://www.w3.org/2005/Atom', - 'db': 'https://www.douban.com/xmlns/', - 'gd': 'http://schemas.google.com/g/2005' - } + 'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom': 'http://www.w3.org/2005/Atom', + 'db': 'https://www.douban.com/xmlns/', + 'gd': 'http://schemas.google.com/g/2005' +} def get_details(browser, url, timeout): # {{{ try: - if Douban.DOUBAN_API_KEY and Douban.DOUBAN_API_KEY != '': + if Douban.DOUBAN_API_KEY: url = url + "?apikey=" + Douban.DOUBAN_API_KEY raw = browser.open_novisit(url, timeout=timeout).read() except Exception as e: - gc = getattr(e, 'getcode', lambda : -1) + gc = getattr(e, 'getcode', lambda: -1) if gc() != 403: raise # Douban is throttling us, wait a little @@ -42,97 +41,73 @@ def get_details(browser, url, timeout): # {{{ raw = browser.open_novisit(url, timeout=timeout).read() return raw + + # 
}}} class Douban(Source): name = 'Douban Books' - author = 'Li Fanxi' - version = (2, 1, 2) + author = 'Li Fanxi, xcffl' + version = (3, 0, 0) minimum_calibre_version = (2, 80, 0) - description = _('Downloads metadata and covers from Douban.com. ' - 'Useful only for Chinese language books.') + description = _( + 'Downloads metadata and covers from Douban.com. ' + 'Useful only for Chinese language books.' + ) capabilities = frozenset(['identify', 'cover']) - touched_fields = frozenset(['title', 'authors', 'tags', - 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', - 'identifier:douban']) # language currently disabled + touched_fields = frozenset([ + 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', + 'identifier:isbn', 'rating', 'identifier:douban' + ]) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True - DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a' + DOUBAN_API_URL = 'https://api.douban.com/v2/book/search' DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' options = ( - Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'), - _('Whether to append subtitle in the book title.')), + Option( + 'include_subtitle_in_title', 'bool', True, + _('Include subtitle in book title:'), + _('Whether to append subtitle in the book title.') + ), ) def to_metadata(self, browser, log, entry_, timeout): # {{{ - from lxml import etree - from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.date import parse_date, utcnow - from calibre.utils.cleantext import clean_ascii_chars - - XPath = partial(etree.XPath, namespaces=NAMESPACES) - entry = XPath('//atom:entry') - entry_id = XPath('descendant::atom:id') - title = XPath('descendant::atom:title') - description = XPath('descendant::atom:summary') - subtitle = XPath("descendant::db:attribute[@name='subtitle']") - publisher = 
XPath("descendant::db:attribute[@name='publisher']") - isbn = XPath("descendant::db:attribute[@name='isbn13']") - date = XPath("descendant::db:attribute[@name='pubdate']") - creator = XPath("descendant::db:attribute[@name='author']") - booktag = XPath("descendant::db:tag/attribute::name") - rating = XPath("descendant::gd:rating/attribute::average") - cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") - - def get_text(extra, x): - try: - ans = x(extra) - if ans: - ans = ans[0].text - if ans and ans.strip(): - return ans.strip() - except: - log.exception('Programming error:') - return None - id_url = entry_id(entry_)[0].text.replace('http://', 'https://') - douban_id = id_url.split('/')[-1] - title_ = ': '.join([x.text for x in title(entry_)]).strip() - subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip() - if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0: - title_ = title_ + ' - ' + subtitle - authors = [x.text.strip() for x in creator(entry_) if x.text] + douban_id = entry_.get('id') + title = entry_.get('title') + description = entry_.get('summary') + # subtitle = entry_.get('subtitle') # TODO: std metadata doesn't have this field + publisher = entry_.get('publisher') + isbns = entry_.get('isbn13') # ISBN11 is obsolete, use ISBN13 + pubdate = entry_.get('pubdate') + authors = entry_.get('author') + book_tags = entry_.get('tags') + rating = entry_.get('rating') + cover_url = entry_.get('image') + series = entry_.get('series') + if not authors: authors = [_('Unknown')] - if not id_url or not title: + if not douban_id or not title: # Silently discard this entry return None - mi = Metadata(title_, authors) - mi.identifiers = {'douban':douban_id} - try: - log.info(id_url) - raw = get_details(browser, id_url, timeout) - feed = etree.fromstring( - xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], - parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False) - ) - extra = entry(feed)[0] - 
except: - log.exception('Failed to get additional details for', mi.title) - return mi - mi.comments = get_text(extra, description) - mi.publisher = get_text(extra, publisher) + mi = Metadata(title, authors) + mi.identifiers = {'douban': douban_id} + mi.publisher = publisher + mi.comments = description + # mi.subtitle = subtitle # ISBN - isbns = [] - for x in [t.text for t in isbn(extra)]: + for x in isbns: if check_isbn(x): isbns.append(x) if isbns: @@ -140,52 +115,45 @@ class Douban(Source): mi.all_isbns = isbns # Tags - try: - btags = [x for x in booktag(extra) if x] - tags = [] - for t in btags: - atags = [y.strip() for y in t.split('/')] - for tag in atags: - if tag not in tags: - tags.append(tag) - except: - log.exception('Failed to parse tags:') - tags = [] - if tags: - mi.tags = [x.replace(',', ';') for x in tags] + mi.tags = [tag['name'] for tag in book_tags] # pubdate - pubdate = get_text(extra, date) if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: - log.error('Failed to parse pubdate %r'%pubdate) + log.error('Failed to parse pubdate %r' % pubdate) # Ratings - if rating(extra): + if rating: try: - mi.rating = float(rating(extra)[0]) / 2.0 + mi.rating = float(rating['average']) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 # Cover mi.has_douban_cover = None - u = cover_url(extra) + u = cover_url if u: - u = u[0].replace('/spic/', '/lpic/') # If URL contains "book-default", the book doesn't have a cover if u.find('book-default') == -1: mi.has_douban_cover = u + + # Series + if series: + mi.series = series['title'] + return mi + # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return ('douban', db, self.DOUBAN_BOOK_URL%db) + return ('douban', db, self.DOUBAN_BOOK_URL % db) + # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ @@ -193,9 +161,9 @@ class Douban(Source): from 
urllib.parse import urlencode except ImportError: from urllib import urlencode - SEARCH_URL = 'https://api.douban.com/book/subjects?' - ISBN_URL = 'https://api.douban.com/book/subject/isbn/' - SUBJECT_URL = 'https://api.douban.com/book/subject/' + SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&' + ISBN_URL = 'https://api.douban.com/v2/book/isbn/' + SUBJECT_URL = 'https://api.douban.com/v2/book/' q = '' t = None @@ -208,16 +176,18 @@ class Douban(Source): q = subject t = 'subject' elif title or authors: + def build_term(prefix, parts): return ' '.join(x for x in parts) + title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) - author_tokens = list(self.get_author_tokens(authors, - only_first_author=True)) + author_tokens = list( + self.get_author_tokens(authors, only_first_author=True) + ) if author_tokens: - q += ((' ' if q != '' else '') + - build_term('author', author_tokens)) + q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' q = q.strip() if isinstance(q, type(u'')): @@ -231,24 +201,40 @@ class Douban(Source): url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ - 'q': q, - }) + 'q': q, + }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': if t == "isbn" or t == "subject": url = url + "?apikey=" + self.DOUBAN_API_KEY else: url = url + "&apikey=" + self.DOUBAN_API_KEY return url + # }}} - def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): + def download_cover( + self, + log, + result_queue, + abort, # {{{ + title=None, + authors=None, + identifiers={}, + timeout=30, + get_best_cover=False + ): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() - self.identify(log, rq, abort, title=title, authors=authors, - identifiers=identifiers) + self.identify( + log, + rq, + abort, + title=title, 
+ authors=authors, + identifiers=identifiers + ) if abort.is_set(): return results = [] @@ -257,8 +243,11 @@ class Douban(Source): results.append(rq.get_nowait()) except Empty: break - results.sort(key=self.identify_results_keygen( - title=title, authors=authors, identifiers=identifiers)) + results.sort( + key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers + ) + ) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: @@ -291,11 +280,18 @@ class Douban(Source): url = self.cached_identifier_to_cover_url(db) return url + # }}} - def get_all_details(self, br, log, entries, abort, # {{{ - result_queue, timeout): - from lxml import etree + def get_all_details( + self, + br, + log, + entries, + abort, # {{{ + result_queue, + timeout + ): for relevance, i in enumerate(entries): try: ans = self.to_metadata(br, log, i, timeout) @@ -305,29 +301,31 @@ class Douban(Source): for isbn in getattr(ans, 'all_isbns', []): self.cache_isbn_to_identifier(isbn, db) if ans.has_douban_cover: - self.cache_identifier_to_cover_url(db, - ans.has_douban_cover) + self.cache_identifier_to_cover_url(db, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: - log.exception( - 'Failed to get metadata for identify entry:', - etree.tostring(i)) + log.exception('Failed to get metadata for identify entry:', i) if abort.is_set(): break - # }}} - - def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ - identifiers={}, timeout=30): - from lxml import etree - from calibre.ebooks.chardet import xml_to_unicode - from calibre.utils.cleantext import clean_ascii_chars - XPath = partial(etree.XPath, namespaces=NAMESPACES) - entry = XPath('//atom:entry') + # }}} - query = self.create_query(log, title=title, authors=authors, - identifiers=identifiers) + def identify( + self, + log, + result_queue, + abort, + title=None, + authors=None, # {{{ + identifiers={}, + timeout=30 + 
): + import json + + query = self.create_query( + log, title=title, authors=authors, identifiers=identifiers + ) if not query: log.error('Insufficient metadata to construct query') return @@ -335,45 +333,51 @@ class Douban(Source): try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: - log.exception('Failed to make identify query: %r'%query) + log.exception('Failed to make identify query: %r' % query) return as_unicode(e) try: - parser = etree.XMLParser(recover=True, no_network=True) - feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), - strip_encoding_pats=True)[0], parser=parser) - entries = entry(feed) + entries = json.loads(raw)['books'] except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) if not entries and identifiers and title and authors and \ not abort.is_set(): - return self.identify(log, result_queue, abort, title=title, - authors=authors, timeout=timeout) - + return self.identify( + log, + result_queue, + abort, + title=title, + authors=authors, + timeout=timeout + ) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) return None + # }}} if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py - from calibre.ebooks.metadata.sources.test import (test_identify_plugin, - title_test, authors_test) - test_identify_plugin(Douban.name, - [ - ( - {'identifiers':{'isbn': '9787536692930'}, 'title':'三体', - 'authors':['刘慈欣']}, - [title_test('三体', exact=True), - authors_test(['刘慈欣'])] - ), - - ( - {'title': 'Linux内核修炼之道', 'authors':['任桥伟']}, - [title_test('Linux内核修炼之道', exact=False)] - ), - ]) + from calibre.ebooks.metadata.sources.test import ( + test_identify_plugin, title_test, authors_test + ) + test_identify_plugin( + Douban.name, [ + ({ + 'identifiers': { + 'isbn': '9787536692930' + 
}, + 'title': '三体', + 'authors': ['刘慈欣'] + }, [title_test('三体', exact=True), + authors_test(['刘慈欣'])]), + ({ + 'title': 'Linux内核修炼之道', + 'authors': ['任桥伟'] + }, [title_test('Linux内核修炼之道', exact=False)]), + ] + ) # }}} -- 2.25.1