From 78f858a87596a883afe6c0610fce3426c76ed86d Mon Sep 17 00:00:00 2001 From: xcffl <--list> Date: Mon, 2 Mar 2020 22:08:38 +0800 Subject: [PATCH] Fix & update Douban API --- src/calibre/ebooks/metadata/sources/douban.py | 304 +++++++++--------- 1 file changed, 154 insertions(+), 150 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index bb044f7cca..67ec87ee49 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ; 2011, Li Fanxi ' __docformat__ = 'restructuredtext en' @@ -14,27 +14,26 @@ try: except ImportError: from Queue import Empty, Queue - from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import Option, Source from calibre.ebooks.metadata.book.base import Metadata from calibre import as_unicode NAMESPACES = { - 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', - 'atom' : 'http://www.w3.org/2005/Atom', - 'db': 'https://www.douban.com/xmlns/', - 'gd': 'http://schemas.google.com/g/2005' - } + 'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom': 'http://www.w3.org/2005/Atom', + 'db': 'https://www.douban.com/xmlns/', + 'gd': 'http://schemas.google.com/g/2005' +} def get_details(browser, url, timeout): # {{{ try: - if Douban.DOUBAN_API_KEY and Douban.DOUBAN_API_KEY != '': + if Douban.DOUBAN_API_KEY: url = url + "?apikey=" + Douban.DOUBAN_API_KEY raw = browser.open_novisit(url, timeout=timeout).read() except Exception as e: - gc = getattr(e, 'getcode', lambda : -1) + gc = getattr(e, 'getcode', lambda: -1) if gc() != 403: raise # Douban is throttling us, wait a little @@ -42,97 +41,73 @@ def get_details(browser, url, timeout): # {{{ raw = browser.open_novisit(url, timeout=timeout).read() return raw + + # 
}}} class Douban(Source): name = 'Douban Books' - author = 'Li Fanxi' - version = (2, 1, 2) + author = 'Li Fanxi, xcffl' + version = (3, 0, 0) minimum_calibre_version = (2, 80, 0) - description = _('Downloads metadata and covers from Douban.com. ' - 'Useful only for Chinese language books.') + description = _( + 'Downloads metadata and covers from Douban.com. ' + 'Useful only for Chinese language books.' + ) capabilities = frozenset(['identify', 'cover']) - touched_fields = frozenset(['title', 'authors', 'tags', - 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', - 'identifier:douban']) # language currently disabled + touched_fields = frozenset([ + 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', + 'identifier:isbn', 'rating', 'identifier:douban' + ]) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True - DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a' + DOUBAN_API_URL = 'https://api.douban.com/v2/book/search' DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' options = ( - Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'), - _('Whether to append subtitle in the book title.')), + Option( + 'include_subtitle_in_title', 'bool', True, + _('Include subtitle in book title:'), + _('Whether to append subtitle in the book title.') + ), ) def to_metadata(self, browser, log, entry_, timeout): # {{{ - from lxml import etree - from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.date import parse_date, utcnow - from calibre.utils.cleantext import clean_ascii_chars - - XPath = partial(etree.XPath, namespaces=NAMESPACES) - entry = XPath('//atom:entry') - entry_id = XPath('descendant::atom:id') - title = XPath('descendant::atom:title') - description = XPath('descendant::atom:summary') - subtitle = XPath("descendant::db:attribute[@name='subtitle']") - publisher = 
XPath("descendant::db:attribute[@name='publisher']") - isbn = XPath("descendant::db:attribute[@name='isbn13']") - date = XPath("descendant::db:attribute[@name='pubdate']") - creator = XPath("descendant::db:attribute[@name='author']") - booktag = XPath("descendant::db:tag/attribute::name") - rating = XPath("descendant::gd:rating/attribute::average") - cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") - - def get_text(extra, x): - try: - ans = x(extra) - if ans: - ans = ans[0].text - if ans and ans.strip(): - return ans.strip() - except: - log.exception('Programming error:') - return None - id_url = entry_id(entry_)[0].text.replace('http://', 'https://') - douban_id = id_url.split('/')[-1] - title_ = ': '.join([x.text for x in title(entry_)]).strip() - subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip() - if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0: - title_ = title_ + ' - ' + subtitle - authors = [x.text.strip() for x in creator(entry_) if x.text] + douban_id = entry_.get('id') + title = entry_.get('title') + description = entry_.get('summary') + # subtitle = entry_.get('subtitle') # TODO: std metadata doesn't have this field + publisher = entry_.get('publisher') + isbns = entry_.get('isbn13') # ISBN11 is obsolete, use ISBN13 + pubdate = entry_.get('pubdate') + authors = entry_.get('author') + book_tags = entry_.get('tags') + rating = entry_.get('rating') + cover_url = entry_.get('image') + series = entry_.get('series') + if not authors: authors = [_('Unknown')] - if not id_url or not title: + if not douban_id or not title: # Silently discard this entry return None - mi = Metadata(title_, authors) - mi.identifiers = {'douban':douban_id} - try: - log.info(id_url) - raw = get_details(browser, id_url, timeout) - feed = etree.fromstring( - xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], - parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False) - ) - extra = entry(feed)[0] - 
except: - log.exception('Failed to get additional details for', mi.title) - return mi - mi.comments = get_text(extra, description) - mi.publisher = get_text(extra, publisher) + mi = Metadata(title, authors) + mi.identifiers = {'douban': douban_id} + mi.publisher = publisher + mi.comments = description + # mi.subtitle = subtitle # ISBN - isbns = [] - for x in [t.text for t in isbn(extra)]: + for x in isbns: if check_isbn(x): isbns.append(x) if isbns: @@ -140,52 +115,45 @@ class Douban(Source): mi.all_isbns = isbns # Tags - try: - btags = [x for x in booktag(extra) if x] - tags = [] - for t in btags: - atags = [y.strip() for y in t.split('/')] - for tag in atags: - if tag not in tags: - tags.append(tag) - except: - log.exception('Failed to parse tags:') - tags = [] - if tags: - mi.tags = [x.replace(',', ';') for x in tags] + mi.tags = [tag['name'] for tag in book_tags] # pubdate - pubdate = get_text(extra, date) if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: - log.error('Failed to parse pubdate %r'%pubdate) + log.error('Failed to parse pubdate %r' % pubdate) # Ratings - if rating(extra): + if rating: try: - mi.rating = float(rating(extra)[0]) / 2.0 + mi.rating = float(rating['average']) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 # Cover mi.has_douban_cover = None - u = cover_url(extra) + u = cover_url if u: - u = u[0].replace('/spic/', '/lpic/') # If URL contains "book-default", the book doesn't have a cover if u.find('book-default') == -1: mi.has_douban_cover = u + + # Series + if series: + mi.series = series['title'] + return mi + # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return ('douban', db, self.DOUBAN_BOOK_URL%db) + return ('douban', db, self.DOUBAN_BOOK_URL % db) + # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ @@ -193,9 +161,9 @@ class Douban(Source): from 
urllib.parse import urlencode except ImportError: from urllib import urlencode - SEARCH_URL = 'https://api.douban.com/book/subjects?' - ISBN_URL = 'https://api.douban.com/book/subject/isbn/' - SUBJECT_URL = 'https://api.douban.com/book/subject/' + SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&' + ISBN_URL = 'https://api.douban.com/v2/book/isbn/' + SUBJECT_URL = 'https://api.douban.com/v2/book/' q = '' t = None @@ -208,16 +176,18 @@ class Douban(Source): q = subject t = 'subject' elif title or authors: + def build_term(prefix, parts): return ' '.join(x for x in parts) + title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) - author_tokens = list(self.get_author_tokens(authors, - only_first_author=True)) + author_tokens = list( + self.get_author_tokens(authors, only_first_author=True) + ) if author_tokens: - q += ((' ' if q != '' else '') + - build_term('author', author_tokens)) + q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' q = q.strip() if isinstance(q, type(u'')): @@ -231,24 +201,40 @@ class Douban(Source): url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ - 'q': q, - }) + 'q': q, + }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': if t == "isbn" or t == "subject": url = url + "?apikey=" + self.DOUBAN_API_KEY else: url = url + "&apikey=" + self.DOUBAN_API_KEY return url + # }}} - def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): + def download_cover( + self, + log, + result_queue, + abort, # {{{ + title=None, + authors=None, + identifiers={}, + timeout=30, + get_best_cover=False + ): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() - self.identify(log, rq, abort, title=title, authors=authors, - identifiers=identifiers) + self.identify( + log, + rq, + abort, + title=title, 
+ authors=authors, + identifiers=identifiers + ) if abort.is_set(): return results = [] @@ -257,8 +243,11 @@ class Douban(Source): results.append(rq.get_nowait()) except Empty: break - results.sort(key=self.identify_results_keygen( - title=title, authors=authors, identifiers=identifiers)) + results.sort( + key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers + ) + ) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: @@ -291,11 +280,18 @@ class Douban(Source): url = self.cached_identifier_to_cover_url(db) return url + # }}} - def get_all_details(self, br, log, entries, abort, # {{{ - result_queue, timeout): - from lxml import etree + def get_all_details( + self, + br, + log, + entries, + abort, # {{{ + result_queue, + timeout + ): for relevance, i in enumerate(entries): try: ans = self.to_metadata(br, log, i, timeout) @@ -305,29 +301,31 @@ class Douban(Source): for isbn in getattr(ans, 'all_isbns', []): self.cache_isbn_to_identifier(isbn, db) if ans.has_douban_cover: - self.cache_identifier_to_cover_url(db, - ans.has_douban_cover) + self.cache_identifier_to_cover_url(db, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: - log.exception( - 'Failed to get metadata for identify entry:', - etree.tostring(i)) + log.exception('Failed to get metadata for identify entry:', i) if abort.is_set(): break - # }}} - - def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ - identifiers={}, timeout=30): - from lxml import etree - from calibre.ebooks.chardet import xml_to_unicode - from calibre.utils.cleantext import clean_ascii_chars - XPath = partial(etree.XPath, namespaces=NAMESPACES) - entry = XPath('//atom:entry') + # }}} - query = self.create_query(log, title=title, authors=authors, - identifiers=identifiers) + def identify( + self, + log, + result_queue, + abort, + title=None, + authors=None, # {{{ + identifiers={}, + timeout=30 + 
): + import json + + query = self.create_query( + log, title=title, authors=authors, identifiers=identifiers + ) if not query: log.error('Insufficient metadata to construct query') return @@ -335,45 +333,51 @@ class Douban(Source): try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: - log.exception('Failed to make identify query: %r'%query) + log.exception('Failed to make identify query: %r' % query) return as_unicode(e) try: - parser = etree.XMLParser(recover=True, no_network=True) - feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), - strip_encoding_pats=True)[0], parser=parser) - entries = entry(feed) + entries = json.loads(raw)['books'] except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) if not entries and identifiers and title and authors and \ not abort.is_set(): - return self.identify(log, result_queue, abort, title=title, - authors=authors, timeout=timeout) - + return self.identify( + log, + result_queue, + abort, + title=title, + authors=authors, + timeout=timeout + ) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) return None + # }}} if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py - from calibre.ebooks.metadata.sources.test import (test_identify_plugin, - title_test, authors_test) - test_identify_plugin(Douban.name, - [ - ( - {'identifiers':{'isbn': '9787536692930'}, 'title':'三体', - 'authors':['刘慈欣']}, - [title_test('三体', exact=True), - authors_test(['刘慈欣'])] - ), - - ( - {'title': 'Linux内核修炼之道', 'authors':['任桥伟']}, - [title_test('Linux内核修炼之道', exact=False)] - ), - ]) + from calibre.ebooks.metadata.sources.test import ( + test_identify_plugin, title_test, authors_test + ) + test_identify_plugin( + Douban.name, [ + ({ + 'identifiers': { + 'isbn': '9787536692930' + 
}, + 'title': '三体', + 'authors': ['刘慈欣'] + }, [title_test('三体', exact=True), + authors_test(['刘慈欣'])]), + ({ + 'title': 'Linux内核修炼之道', + 'authors': ['任桥伟'] + }, [title_test('Linux内核修炼之道', exact=False)]), + ] + ) # }}} -- 2.25.1