1414
def json_decode(output):
    """Decode a JSON document, returning None instead of raising on failure.

    Args:
        output: JSON text to parse (a text string, or UTF-8 encoded bytes).

    Returns:
        The decoded Python object, or None when ``output`` cannot be parsed
        (malformed JSON, empty string, None, or any other unusable input).
    """
    try:
        # No ``encoding`` keyword: UTF-8 is already the default for byte
        # input, and the keyword was removed in Python 3.9, where passing it
        # raises TypeError and would make every call return None.
        return json.loads(output)
    except Exception:
        # Deliberate best-effort: callers treat None as "no usable JSON".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
2020
2121
2222# URLs
23- VERSION_NO = '1.2017.11.10 .1'
23+ VERSION_NO = '1.2017.12.21 .1'
2424
2525REQUEST_DELAY = 0 # Delay used when requesting HTML, may be good to have to prevent being banned from the site
2626
@@ -31,10 +31,10 @@ def json_decode(output):
3131THREAD_MAX = 20
3232
# Per-language Audible site settings, keyed by language code.
#   url       - host name of the site; SetupUrls uses it as the base of every URL
#   urltitle  - query-string prefix for the title field when building search URLs
#   rel_date  - release-date label, copied into ctx['REL_DATE'] and matched against page text
#   nar_by    - narrator label, copied into ctx['NAR_BY']
#   rel_date2 / nar_by2 - optional alternate labels; presumably feed the
#               ctx['*_INFO'] variants -- confirm against SetupUrls
3333intl_sites = {
34- 'en' : { 'url' : 'www.audible.com' , 'rel_date' : u'Release Date ' , 'nar_by' : u'Narrated By' , 'nar_by2' : u'Narrated by' },
35- 'fr' : { 'url' : 'www.audible.fr' , 'rel_date' : u'Date de publication' , 'nar_by' : u'Narrateur(s)' , 'nar_by2' : u'Lu par' },
36- 'de' : { 'url' : 'www.audible.de' , 'rel_date' : u'Erscheinungsdatum' , 'nar_by' : u'Gesprochen von' , 'rel_date2' : u'Veröffentlicht' },
37- 'it' : { 'url' : 'www.audible.it' , 'rel_date' : u'Data di Pubblicazione' , 'nar_by' : u'Narratore' },
34+ 'en' : { 'url' : 'www.audible.com' , 'urltitle' : u'title=' , ' rel_date' : u'Release date ' , 'nar_by' : u'Narrated By' , 'nar_by2' : u'Narrated by' },
35+ 'fr' : { 'url' : 'www.audible.fr' , 'urltitle' : u'searchTitle=' , ' rel_date' : u'Date de publication' , 'nar_by' : u'Narrateur(s)' , 'nar_by2' : u'Lu par' },
36+ 'de' : { 'url' : 'www.audible.de' , 'urltitle' : u'searchTitle=' , ' rel_date' : u'Erscheinungsdatum' , 'nar_by' : u'Gesprochen von' , 'rel_date2' : u'Veröffentlicht' },
37+ 'it' : { 'url' : 'www.audible.it' , 'urltitle' : u'searchTitle=' , ' rel_date' : u'Data di Pubblicazione' , 'nar_by' : u'Narratore' },
3838 #'jp' : { 'url': 'www.audible.co.jp', 'rel_date' : u'N/A', 'nar_by' : u'ナレーター' }, # untested
3939 }
4040
@@ -58,6 +58,7 @@ def SetupUrls(sitetype, base, lang='en'):
5858 lang = sites_langs [base ]['lang' ]
5959 if lang in intl_sites :
6060 base = intl_sites [lang ]['url' ]
61+ urlsearchtitle = intl_sites [lang ]['urltitle' ]
6162 ctx ['REL_DATE' ]= intl_sites [lang ]['rel_date' ]
6263 ctx ['NAR_BY' ]= intl_sites [lang ]['nar_by' ]
6364 if 'rel_date2' in intl_sites [lang ]:
@@ -69,7 +70,7 @@ def SetupUrls(sitetype, base, lang='en'):
6970 else :
7071 ctx ['NAR_BY_INFO' ]= ctx ['NAR_BY' ]
7172 else :
72- ctx ['REL_DATE' ]= 'Release Date '
73+ ctx ['REL_DATE' ]= 'Release date '
7374 ctx ['REL_DATE_INFO' ]= ctx ['REL_DATE' ]
7475 ctx ['NAR_BY' ]= 'Narrated By'
7576 ctx ['NAR_BY_INFO' ]= 'Narrated by'
@@ -87,6 +88,7 @@ def SetupUrls(sitetype, base, lang='en'):
8788 base = 'www.audible.com'
8889 if lang in intl_sites :
8990 base = intl_sites [lang ]['url' ]
91+ urlsearchtitle = intl_sites [lang ]['urltitle' ]
9092 ctx ['REL_DATE' ]= intl_sites [lang ]['rel_date' ]
9193 ctx ['NAR_BY' ]= intl_sites [lang ]['nar_by' ]
9294 if 'rel_date2' in intl_sites [lang ]:
@@ -98,18 +100,19 @@ def SetupUrls(sitetype, base, lang='en'):
98100 else :
99101 ctx ['NAR_BY_INFO' ]= ctx ['NAR_BY' ]
100102 else :
101- ctx ['REL_DATE' ]= 'Release Date '
103+ ctx ['REL_DATE' ]= 'Release date '
102104 ctx ['REL_DATE_INFO' ]= ctx ['REL_DATE' ]
103105 ctx ['NAR_BY' ]= 'Narrated By'
104106 ctx ['NAR_BY_INFO' ]= 'Narrated by'
105107
106108
107109 AUD_BASE_URL = 'https://' + str (base ) + '/'
110+ AUD_TITLE_URL = urlsearchtitle
108111 ctx ['AUD_BOOK_INFO' ]= AUD_BASE_URL + 'pd/%s?ipRedirectOverride=true'
109112 ctx ['AUD_ARTIST_SEARCH_URL' ]= AUD_BASE_URL + 'search?searchAuthor=%s&ipRedirectOverride=true'
110- ctx ['AUD_ALBUM_SEARCH_URL' ]= AUD_BASE_URL + 'search?searchTitle= %s&x=41&ipRedirectOverride=true'
113+ ctx ['AUD_ALBUM_SEARCH_URL' ]= AUD_BASE_URL + 'search?' + AUD_TITLE_URL + ' %s&x=41&ipRedirectOverride=true'
111114 ctx ['AUD_KEYWORD_SEARCH_URL' ]= AUD_BASE_URL + 'search?filterby=field-keywords&advsearchKeywords=%s&x=41&ipRedirectOverride=true'
112- ctx ['AUD_SEARCH_URL' ]= AUD_BASE_URL + 'search?searchTitle= {0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
115+ ctx ['AUD_SEARCH_URL' ]= AUD_BASE_URL + 'search?' + AUD_TITLE_URL + ' {0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
113116 return ctx
114117
115118
@@ -170,6 +173,7 @@ def doSearch(self, url, ctx):
170173 html = HTML .ElementFromURL (url , sleep = REQUEST_DELAY )
171174
172175 found = []
176+
173177 for r in html .xpath ('//div[a/img[@class="yborder"]]' ):
174178 date = self .getDateFromString (self .getStringContentFromXPath (r , 'text()[1]' ))
175179 title = self .getStringContentFromXPath (r , 'a[2]' )
@@ -265,7 +269,22 @@ def findDateInTitle(self, title):
265269 def doSearch (self , url , ctx ):
266270 html = HTML .ElementFromURL (url , sleep = REQUEST_DELAY )
267271 found = []
268-
272+ self .Log ('-----------------------------------------just before new xpath line--------------------' )
273+ for r in html .xpath ('//ul//li[contains(@class,"productListItem")]' ):
274+ datetext = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul/li[contains (@class,"releaseDateLabel")]/span' .decode ('utf-8' ))
275+ datetext = re .sub (r'[^0-9\-]' , '' ,datetext )
276+ date = self .getDateFromString (datetext )
277+ title = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul//a[1]' )
278+ murl = self .getAnchorUrlFromXPath (r , 'div/div/div/div/div/div/span/ul/li/h3//a[1]' )
279+ thumb = self .getImageUrlFromXPath (r , 'div/div/div/div/div/div/div[contains(@class,"responsive-product-square")]/div/a/img' )
280+ author = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul/li[contains (@class,"authorLabel")]/span/a[1]' )
281+ narrator = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul/li[contains (@class,"narratorLabel")]/span//a[1]' .format (ctx ['NAR_BY' ]).decode ('utf-8' ))
282+ self .Log ('---------------------------------------XPATH SEARCH HIT-----------------------------------------------' )
283+
284+ found .append ({'url' : murl , 'title' : title , 'date' : date , 'thumb' : thumb , 'author' : author , 'narrator' : narrator })
285+
286+ self .Log ('-----------------------------------------just after new xpath line--------------------' )
287+
269288 for r in html .xpath ('//div[contains (@class, "adbl-search-result")]' ):
270289 date = self .getDateFromString (self .getStringContentFromXPath (r , 'div/div/ul/li[contains (., "{0}")]/span[2]//text()' .format (ctx ['REL_DATE' ]).decode ('utf-8' )))
271290 title = self .getStringContentFromXPath (r , 'div/div/div/div/a[1]' )
@@ -352,7 +371,7 @@ def search(self, results, media, lang, manual):
352371 self .Log ('Found %s result(s) for query "%s"' , len (found ), normalizedName )
353372 i = 1
354373 for f in found :
355- self .Log (' %s. (title) %s (url)[%s] (date)(%s) (thumb){%s}' , i , f ['title' ], f ['url' ], str (f ['date' ]), f ['thumb' ])
374+ self .Log (' %s. (title) %s (author) %s ( url)[%s] (date)(%s) (thumb){%s}' , i , f ['title' ], f [ 'author ' ], f ['url' ], str (f ['date' ]), f ['thumb' ])
356375 i += 1
357376
358377 self .Log ('-----------------------------------------------------------------------' )
@@ -364,11 +383,17 @@ def search(self, results, media, lang, manual):
364383 self .Log ('URL For Breakdown: %s' , url )
365384
366385 # Get the id
386+ # for itemId in url.split('/') :
367387 for itemId in url .split ('/' ) :
368388 if re .match (r'B0[0-9A-Z]{8,8}' , itemId ):
369389 break
370390 itemId = None
371391
392+ #New Search results contain question marks after the ID
393+ for itemId in itemId .split ('?' ) :
394+ if re .match (r'B0[0-9A-Z]{8,8}' , itemId ):
395+ break
396+
372397 if len (itemId ) == 0 :
373398 Log ('No Match: %s' , url )
374399 continue
@@ -447,6 +472,7 @@ def update(self, metadata, media, lang, force=False):
447472 pass
448473
449474 date = None
475+ rating = None
450476 series = ''
451477 genre1 = None
452478 genre2 = None
@@ -466,10 +492,16 @@ def update(self, metadata, media, lang, force=False):
466492 self .Log ('---------------------------------------XPATH SEARCH HIT-----------------------------------------------' )
467493
468494 if date is None :
495+ #for r in html.xpath('//div[contains (@class,"slot bottomSlot")]/script[contains (@type, "application/ld+json")]'):
469496 for r in html .xpath ('//script[contains (@type, "application/ld+json")]' ):
470497 page_content = r .text_content ()
471- page_content = page_content .replace ('\n ' , '' ) # Remove and new lines. JSON doesn't like them.
472- page_content = re .sub (r'\\(?![bfnrtv\'\"\\])' , '' , page_content ) # Remove any backslashes that aren't escaping a character JSON needs escaped
498+ page_content = page_content .replace ('\n ' , '' )
499+ #page_content = page_content.replace('\'', '\\\'')
500+ #page_content = re.sub(r'\\(?![bfnrtv\'\"\\])', '', page_content)
501+ # Remove any backslashes that aren't escaping a character JSON needs escaped
502+ remove_inv_json_esc = re .compile (r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))' )
503+ page_content = remove_inv_json_esc .sub (r'\1\\\2' , page_content )
504+ self .Log (page_content )
473505 json_data = json_decode (page_content )
474506 for json_data in json_data :
475507 if 'datePublished' in json_data :
@@ -499,7 +531,7 @@ def update(self, metadata, media, lang, force=False):
499531 #for key in json_data:
500532 # Log('{0}:{1}'.format(key, json_data[key]))
501533 genre1 = json_data ['itemListElement' ][1 ]['item' ]['name' ]
502- try : # Not all books have two genre tags.
534+ try :
503535 genre2 = json_data ['itemListElement' ][2 ]['item' ]['name' ]
504536 except :
505537 continue
0 commit comments