1414
def json_decode(output):
    """Parse *output* as JSON, returning ``None`` when it cannot be decoded.

    Args:
        output: JSON text, either as a text string or as UTF-8 encoded bytes.

    Returns:
        The decoded Python object, or ``None`` if *output* is not valid JSON
        (or is not a string at all).
    """
    try:
        # Decode bytes explicitly instead of using json.loads(encoding=...):
        # that keyword was removed in Python 3.9, where passing it raises a
        # TypeError that a bare ``except`` would silently turn into None for
        # every input.
        if isinstance(output, bytes):
            output = output.decode("utf-8")
        return json.loads(output)
    except Exception:
        # Preserve the None-on-failure contract while still letting
        # KeyboardInterrupt / SystemExit propagate.
        return None
2020
2121
2222# URLs
23- VERSION_NO = '1.2017.11.10 .1'
23+ VERSION_NO = '1.2017.12.21 .1'
2424
2525REQUEST_DELAY = 0 # Delay used when requesting HTML, may be good to have to prevent being banned from the site
2626
@@ -31,10 +31,10 @@ def json_decode(output):
3131THREAD_MAX = 20
3232
# Per-language Audible site settings, keyed by language code. Each entry has:
#   url      - host name of the regional store
#   urltitle - query-string parameter used when searching by title
#   rel_date - label text that precedes the release date on that site
#   nar_by   - label text that precedes the narrator on that site
# 'rel_date2' / 'nar_by2' are optional alternative labels; how they are
# consumed is outside this view (presumably the *_INFO context values).
# NOTE: the key must be exactly 'rel_date' (no leading space), because the
# lookups read intl_sites[lang]['rel_date'].
intl_sites = {
    'en': {'url': 'www.audible.com', 'urltitle': u'title=', 'rel_date': u'Release date ', 'nar_by': u'Narrated By', 'nar_by2': u'Narrated by'},
    'fr': {'url': 'www.audible.fr', 'urltitle': u'searchTitle=', 'rel_date': u'Date de publication', 'nar_by': u'Narrateur(s)', 'nar_by2': u'Lu par'},
    'de': {'url': 'www.audible.de', 'urltitle': u'searchTitle=', 'rel_date': u'Erscheinungsdatum', 'nar_by': u'Gesprochen von', 'rel_date2': u'Veröffentlicht'},
    'it': {'url': 'www.audible.it', 'urltitle': u'searchTitle=', 'rel_date': u'Data di Pubblicazione', 'nar_by': u'Narratore'},
    #'jp' : { 'url': 'www.audible.co.jp', 'rel_date' : u'N/A', 'nar_by' : u'ナレーター' }, # untested
}
4040
@@ -57,6 +57,7 @@ def SetupUrls(sitetype, base, lang='en'):
5757 Log ('Pulling language from sites array' )
5858 lang = sites_langs [base ]['lang' ]
5959 if lang in intl_sites :
60+ urlsearchtitle = intl_sites [lang ]['urltitle' ]
6061 ctx ['REL_DATE' ]= intl_sites [lang ]['rel_date' ]
6162 ctx ['NAR_BY' ]= intl_sites [lang ]['nar_by' ]
6263 if 'rel_date2' in intl_sites [lang ]:
@@ -68,7 +69,7 @@ def SetupUrls(sitetype, base, lang='en'):
6869 else :
6970 ctx ['NAR_BY_INFO' ]= ctx ['NAR_BY' ]
7071 else :
71- ctx ['REL_DATE' ]= 'Release Date '
72+ ctx ['REL_DATE' ]= 'Release date '
7273 ctx ['REL_DATE_INFO' ]= ctx ['REL_DATE' ]
7374 ctx ['NAR_BY' ]= 'Narrated By'
7475 ctx ['NAR_BY_INFO' ]= 'Narrated by'
@@ -86,6 +87,7 @@ def SetupUrls(sitetype, base, lang='en'):
8687 base = 'www.audible.com'
8788 if lang in intl_sites :
8889 base = intl_sites [lang ]['url' ]
90+ urlsearchtitle = intl_sites [lang ]['urltitle' ]
8991 ctx ['REL_DATE' ]= intl_sites [lang ]['rel_date' ]
9092 ctx ['NAR_BY' ]= intl_sites [lang ]['nar_by' ]
9193 if 'rel_date2' in intl_sites [lang ]:
@@ -97,18 +99,19 @@ def SetupUrls(sitetype, base, lang='en'):
9799 else :
98100 ctx ['NAR_BY_INFO' ]= ctx ['NAR_BY' ]
99101 else :
100- ctx ['REL_DATE' ]= 'Release Date '
102+ ctx ['REL_DATE' ]= 'Release date '
101103 ctx ['REL_DATE_INFO' ]= ctx ['REL_DATE' ]
102104 ctx ['NAR_BY' ]= 'Narrated By'
103105 ctx ['NAR_BY_INFO' ]= 'Narrated by'
104106
105107
106108 AUD_BASE_URL = 'https://' + str (base ) + '/'
109+ AUD_TITLE_URL = urlsearchtitle
107110 ctx ['AUD_BOOK_INFO' ]= AUD_BASE_URL + 'pd/%s?ipRedirectOverride=true'
108111 ctx ['AUD_ARTIST_SEARCH_URL' ]= AUD_BASE_URL + 'search?searchAuthor=%s&ipRedirectOverride=true'
109- ctx ['AUD_ALBUM_SEARCH_URL' ]= AUD_BASE_URL + 'search?searchTitle= %s&x=41&ipRedirectOverride=true'
112+ ctx ['AUD_ALBUM_SEARCH_URL' ]= AUD_BASE_URL + 'search?' + AUD_TITLE_URL + ' %s&x=41&ipRedirectOverride=true'
110113 ctx ['AUD_KEYWORD_SEARCH_URL' ]= AUD_BASE_URL + 'search?filterby=field-keywords&advsearchKeywords=%s&x=41&ipRedirectOverride=true'
111- ctx ['AUD_SEARCH_URL' ]= AUD_BASE_URL + 'search?searchTitle= {0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
114+ ctx ['AUD_SEARCH_URL' ]= AUD_BASE_URL + 'search?' + AUD_TITLE_URL + ' {0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
112115 return ctx
113116
114117
@@ -169,6 +172,7 @@ def doSearch(self, url, ctx):
169172 html = HTML .ElementFromURL (url , sleep = REQUEST_DELAY )
170173
171174 found = []
175+
172176 for r in html .xpath ('//div[a/img[@class="yborder"]]' ):
173177 date = self .getDateFromString (self .getStringContentFromXPath (r , 'text()[1]' ))
174178 title = self .getStringContentFromXPath (r , 'a[2]' )
@@ -264,7 +268,22 @@ def findDateInTitle(self, title):
264268 def doSearch (self , url , ctx ):
265269 html = HTML .ElementFromURL (url , sleep = REQUEST_DELAY )
266270 found = []
267-
271+ self .Log ('-----------------------------------------just before new xpath line--------------------' )
272+ for r in html .xpath ('//ul//li[contains(@class,"productListItem")]' ):
273+ datetext = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul/li[contains (@class,"releaseDateLabel")]/span' .decode ('utf-8' ))
274+ datetext = re .sub (r'[^0-9\-]' , '' ,datetext )
275+ date = self .getDateFromString (datetext )
276+ title = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul//a[1]' )
277+ murl = self .getAnchorUrlFromXPath (r , 'div/div/div/div/div/div/span/ul/li/h3//a[1]' )
278+ thumb = self .getImageUrlFromXPath (r , 'div/div/div/div/div/div/div[contains(@class,"responsive-product-square")]/div/a/img' )
279+ author = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul/li[contains (@class,"authorLabel")]/span/a[1]' )
280+ narrator = self .getStringContentFromXPath (r , 'div/div/div/div/div/div/span/ul/li[contains (@class,"narratorLabel")]/span//a[1]' .format (ctx ['NAR_BY' ]).decode ('utf-8' ))
281+ self .Log ('---------------------------------------XPATH SEARCH HIT-----------------------------------------------' )
282+
283+ found .append ({'url' : murl , 'title' : title , 'date' : date , 'thumb' : thumb , 'author' : author , 'narrator' : narrator })
284+
285+ self .Log ('-----------------------------------------just after new xpath line--------------------' )
286+
268287 for r in html .xpath ('//div[contains (@class, "adbl-search-result")]' ):
269288 date = self .getDateFromString (self .getStringContentFromXPath (r , 'div/div/ul/li[contains (., "{0}")]/span[2]//text()' .format (ctx ['REL_DATE' ]).decode ('utf-8' )))
270289 title = self .getStringContentFromXPath (r , 'div/div/div/div/a[1]' )
@@ -351,7 +370,7 @@ def search(self, results, media, lang, manual):
351370 self .Log ('Found %s result(s) for query "%s"' , len (found ), normalizedName )
352371 i = 1
353372 for f in found :
354- self .Log (' %s. (title) %s (url)[%s] (date)(%s) (thumb){%s}' , i , f ['title' ], f ['url' ], str (f ['date' ]), f ['thumb' ])
373+ self .Log (' %s. (title) %s (author) %s ( url)[%s] (date)(%s) (thumb){%s}' , i , f ['title' ], f [ 'author ' ], f ['url' ], str (f ['date' ]), f ['thumb' ])
355374 i += 1
356375
357376 self .Log ('-----------------------------------------------------------------------' )
@@ -363,11 +382,17 @@ def search(self, results, media, lang, manual):
363382 self .Log ('URL For Breakdown: %s' , url )
364383
365384 # Get the id
385+ # for itemId in url.split('/') :
366386 for itemId in url .split ('/' ) :
367387 if re .match (r'B0[0-9A-Z]{8,8}' , itemId ):
368388 break
369389 itemId = None
370390
391+ #New Search results contain question marks after the ID
392+ for itemId in itemId .split ('?' ) :
393+ if re .match (r'B0[0-9A-Z]{8,8}' , itemId ):
394+ break
395+
371396 if len (itemId ) == 0 :
372397 Log ('No Match: %s' , url )
373398 continue
@@ -446,10 +471,10 @@ def update(self, metadata, media, lang, force=False):
446471 pass
447472
448473 date = None
474+ rating = None
449475 series = ''
450476 genre1 = None
451477 genre2 = None
452- rating = 0
453478
454479 for r in html .xpath ('//div[contains (@id, "adbl_page_content")]' ):
455480 date = self .getDateFromString (self .getStringContentFromXPath (r , '//li[contains (., "{0}")]/span[2]//text()' .format (ctx ['REL_DATE_INFO' ]).decode ('utf-8' )))
@@ -466,10 +491,16 @@ def update(self, metadata, media, lang, force=False):
466491 self .Log ('---------------------------------------XPATH SEARCH HIT-----------------------------------------------' )
467492
468493 if date is None :
494+ #for r in html.xpath('//div[contains (@class,"slot bottomSlot")]/script[contains (@type, "application/ld+json")]'):
469495 for r in html .xpath ('//script[contains (@type, "application/ld+json")]' ):
470496 page_content = r .text_content ()
471- page_content = page_content .replace ('\n ' , '' ) # Remove and new lines. JSON doesn't like them.
472- page_content = re .sub (r'\\(?![bfnrtv\'\"\\])' , '' , page_content ) # Remove any backslashes that aren't escaping a character JSON needs escaped
497+ page_content = page_content .replace ('\n ' , '' )
498+ #page_content = page_content.replace('\'', '\\\'')
499+ #page_content = re.sub(r'\\(?![bfnrtv\'\"\\])', '', page_content)
500+ # Remove any backslashes that aren't escaping a character JSON needs escaped
501+ remove_inv_json_esc = re .compile (r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))' )
502+ page_content = remove_inv_json_esc .sub (r'\1\\\2' , page_content )
503+ self .Log (page_content )
473504 json_data = json_decode (page_content )
474505 for json_data in json_data :
475506 if 'datePublished' in json_data :
@@ -499,7 +530,7 @@ def update(self, metadata, media, lang, force=False):
499530 #for key in json_data:
500531 # Log('{0}:{1}'.format(key, json_data[key]))
501532 genre1 = json_data ['itemListElement' ][1 ]['item' ]['name' ]
502- try : # Not all books have two genre tags.
533+ try :
503534 genre2 = json_data ['itemListElement' ][2 ]['item' ]['name' ]
504535 except :
505536 continue
0 commit comments