Skip to content

Commit 8f0a91a

Browse files
authored
Update for changes to the Audible.com search function
Audible.com updated its results page and at least one of the major search URL variables. This update adapts to those changes while leaving the international versioning in place.
1 parent 49a32b5 commit 8f0a91a

File tree

1 file changed

+47
-15
lines changed

1 file changed

+47
-15
lines changed

Contents/Code/__init__.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414

1515
def json_decode(output):
1616
try:
17-
return json.loads(output)
17+
return json.loads(output,encoding="utf-8")
1818
except:
1919
return None
2020

2121

2222
# URLs
23-
VERSION_NO = '1.2017.11.10.1'
23+
VERSION_NO = '1.2017.12.21.1'
2424

2525
REQUEST_DELAY = 0 # Delay used when requesting HTML, may be good to have to prevent being banned from the site
2626

@@ -31,10 +31,10 @@ def json_decode(output):
3131
THREAD_MAX = 20
3232

3333
intl_sites={
34-
'en' : { 'url': 'www.audible.com' , 'rel_date' : u'Release Date' , 'nar_by' : u'Narrated By' , 'nar_by2': u'Narrated by'},
35-
'fr' : { 'url': 'www.audible.fr' , 'rel_date' : u'Date de publication' , 'nar_by' : u'Narrateur(s)' , 'nar_by2': u'Lu par'},
36-
'de' : { 'url': 'www.audible.de' , 'rel_date' : u'Erscheinungsdatum' , 'nar_by' : u'Gesprochen von', 'rel_date2': u'Veröffentlicht'},
37-
'it' : { 'url': 'www.audible.it' , 'rel_date' : u'Data di Pubblicazione', 'nar_by' : u'Narratore' },
34+
'en' : { 'url': 'www.audible.com' , 'urltitle' : u'title=' , 'rel_date' : u'Release date' , 'nar_by' : u'Narrated By' , 'nar_by2': u'Narrated by'},
35+
'fr' : { 'url': 'www.audible.fr' , 'urltitle' : u'searchTitle=' , 'rel_date' : u'Date de publication' , 'nar_by' : u'Narrateur(s)' , 'nar_by2': u'Lu par'},
36+
'de' : { 'url': 'www.audible.de' , 'urltitle' : u'searchTitle=' , 'rel_date' : u'Erscheinungsdatum' , 'nar_by' : u'Gesprochen von', 'rel_date2': u'Veröffentlicht'},
37+
'it' : { 'url': 'www.audible.it' , 'urltitle' : u'searchTitle=' , 'rel_date' : u'Data di Pubblicazione', 'nar_by' : u'Narratore' },
3838
#'jp' : { 'url': 'www.audible.co.jp', 'rel_date' : u'N/A', 'nar_by' : u'ナレーター' }, # untested
3939
}
4040

@@ -58,6 +58,7 @@ def SetupUrls(sitetype, base, lang='en'):
5858
lang=sites_langs[base]['lang']
5959
if lang in intl_sites :
6060
base=intl_sites[lang]['url']
61+
urlsearchtitle=intl_sites[lang]['urltitle']
6162
ctx['REL_DATE']=intl_sites[lang]['rel_date']
6263
ctx['NAR_BY' ]=intl_sites[lang]['nar_by']
6364
if 'rel_date2' in intl_sites[lang]:
@@ -69,7 +70,7 @@ def SetupUrls(sitetype, base, lang='en'):
6970
else:
7071
ctx['NAR_BY_INFO' ]=ctx['NAR_BY' ]
7172
else:
72-
ctx['REL_DATE' ]='Release Date'
73+
ctx['REL_DATE' ]='Release date'
7374
ctx['REL_DATE_INFO']=ctx['REL_DATE']
7475
ctx['NAR_BY' ]='Narrated By'
7576
ctx['NAR_BY_INFO' ]='Narrated by'
@@ -87,6 +88,7 @@ def SetupUrls(sitetype, base, lang='en'):
8788
base='www.audible.com'
8889
if lang in intl_sites :
8990
base=intl_sites[lang]['url']
91+
urlsearchtitle=intl_sites[lang]['urltitle']
9092
ctx['REL_DATE']=intl_sites[lang]['rel_date']
9193
ctx['NAR_BY' ]=intl_sites[lang]['nar_by']
9294
if 'rel_date2' in intl_sites[lang]:
@@ -98,18 +100,19 @@ def SetupUrls(sitetype, base, lang='en'):
98100
else:
99101
ctx['NAR_BY_INFO' ]=ctx['NAR_BY' ]
100102
else:
101-
ctx['REL_DATE' ]='Release Date'
103+
ctx['REL_DATE' ]='Release date'
102104
ctx['REL_DATE_INFO']=ctx['REL_DATE']
103105
ctx['NAR_BY' ]='Narrated By'
104106
ctx['NAR_BY_INFO' ]='Narrated by'
105107

106108

107109
AUD_BASE_URL='https://' + str(base) + '/'
110+
AUD_TITLE_URL=urlsearchtitle
108111
ctx['AUD_BOOK_INFO' ]=AUD_BASE_URL + 'pd/%s?ipRedirectOverride=true'
109112
ctx['AUD_ARTIST_SEARCH_URL' ]=AUD_BASE_URL + 'search?searchAuthor=%s&ipRedirectOverride=true'
110-
ctx['AUD_ALBUM_SEARCH_URL' ]=AUD_BASE_URL + 'search?searchTitle=%s&x=41&ipRedirectOverride=true'
113+
ctx['AUD_ALBUM_SEARCH_URL' ]=AUD_BASE_URL + 'search?' + AUD_TITLE_URL + '%s&x=41&ipRedirectOverride=true'
111114
ctx['AUD_KEYWORD_SEARCH_URL']=AUD_BASE_URL + 'search?filterby=field-keywords&advsearchKeywords=%s&x=41&ipRedirectOverride=true'
112-
ctx['AUD_SEARCH_URL' ]=AUD_BASE_URL + 'search?searchTitle={0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
115+
ctx['AUD_SEARCH_URL' ]=AUD_BASE_URL + 'search?' + AUD_TITLE_URL + '{0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
113116
return ctx
114117

115118

@@ -170,6 +173,7 @@ def doSearch(self, url, ctx):
170173
html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY)
171174

172175
found = []
176+
173177
for r in html.xpath('//div[a/img[@class="yborder"]]'):
174178
date = self.getDateFromString(self.getStringContentFromXPath(r, 'text()[1]'))
175179
title = self.getStringContentFromXPath(r, 'a[2]')
@@ -265,7 +269,22 @@ def findDateInTitle(self, title):
265269
def doSearch(self, url, ctx):
266270
html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY)
267271
found = []
268-
272+
self.Log('-----------------------------------------just before new xpath line--------------------')
273+
for r in html.xpath('//ul//li[contains(@class,"productListItem")]'):
274+
datetext = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul/li[contains (@class,"releaseDateLabel")]/span'.decode('utf-8'))
275+
datetext=re.sub(r'[^0-9\-]', '',datetext)
276+
date=self.getDateFromString(datetext)
277+
title = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul//a[1]')
278+
murl = self.getAnchorUrlFromXPath(r, 'div/div/div/div/div/div/span/ul/li/h3//a[1]')
279+
thumb = self.getImageUrlFromXPath(r, 'div/div/div/div/div/div/div[contains(@class,"responsive-product-square")]/div/a/img')
280+
author = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul/li[contains (@class,"authorLabel")]/span/a[1]')
281+
narrator = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul/li[contains (@class,"narratorLabel")]/span//a[1]'.format(ctx['NAR_BY']).decode('utf-8'))
282+
self.Log('---------------------------------------XPATH SEARCH HIT-----------------------------------------------')
283+
284+
found.append({'url': murl, 'title': title, 'date': date, 'thumb': thumb, 'author': author, 'narrator': narrator})
285+
286+
self.Log('-----------------------------------------just after new xpath line--------------------')
287+
269288
for r in html.xpath('//div[contains (@class, "adbl-search-result")]'):
270289
date = self.getDateFromString(self.getStringContentFromXPath(r, 'div/div/ul/li[contains (., "{0}")]/span[2]//text()'.format(ctx['REL_DATE']).decode('utf-8')))
271290
title = self.getStringContentFromXPath(r, 'div/div/div/div/a[1]')
@@ -352,7 +371,7 @@ def search(self, results, media, lang, manual):
352371
self.Log('Found %s result(s) for query "%s"', len(found), normalizedName)
353372
i = 1
354373
for f in found:
355-
self.Log(' %s. (title) %s (url)[%s] (date)(%s) (thumb){%s}', i, f['title'], f['url'], str(f['date']), f['thumb'])
374+
self.Log(' %s. (title) %s (author) %s (url)[%s] (date)(%s) (thumb){%s}', i, f['title'], f['author'], f['url'], str(f['date']), f['thumb'])
356375
i += 1
357376

358377
self.Log('-----------------------------------------------------------------------')
@@ -364,11 +383,17 @@ def search(self, results, media, lang, manual):
364383
self.Log('URL For Breakdown: %s', url)
365384

366385
# Get the id
386+
# for itemId in url.split('/') :
367387
for itemId in url.split('/') :
368388
if re.match(r'B0[0-9A-Z]{8,8}', itemId):
369389
break
370390
itemId=None
371391

392+
#New Search results contain question marks after the ID
393+
for itemId in itemId.split('?') :
394+
if re.match(r'B0[0-9A-Z]{8,8}', itemId):
395+
break
396+
372397
if len(itemId) == 0:
373398
Log('No Match: %s', url)
374399
continue
@@ -447,6 +472,7 @@ def update(self, metadata, media, lang, force=False):
447472
pass
448473

449474
date=None
475+
rating=None
450476
series=''
451477
genre1=None
452478
genre2=None
@@ -466,10 +492,16 @@ def update(self, metadata, media, lang, force=False):
466492
self.Log('---------------------------------------XPATH SEARCH HIT-----------------------------------------------')
467493

468494
if date is None :
495+
#for r in html.xpath('//div[contains (@class,"slot bottomSlot")]/script[contains (@type, "application/ld+json")]'):
469496
for r in html.xpath('//script[contains (@type, "application/ld+json")]'):
470497
page_content = r.text_content()
471-
page_content = page_content.replace('\n', '') # Remove and new lines. JSON doesn't like them.
472-
page_content = re.sub(r'\\(?![bfnrtv\'\"\\])', '', page_content) # Remove any backslashes that aren't escaping a character JSON needs escaped
498+
page_content = page_content.replace('\n', '')
499+
#page_content = page_content.replace('\'', '\\\'')
500+
#page_content = re.sub(r'\\(?![bfnrtv\'\"\\])', '', page_content)
501+
# Remove any backslashes that aren't escaping a character JSON needs escaped
502+
remove_inv_json_esc=re.compile(r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))')
503+
page_content=remove_inv_json_esc.sub(r'\1\\\2', page_content)
504+
self.Log(page_content)
473505
json_data=json_decode(page_content)
474506
for json_data in json_data:
475507
if 'datePublished' in json_data:
@@ -499,7 +531,7 @@ def update(self, metadata, media, lang, force=False):
499531
#for key in json_data:
500532
# Log('{0}:{1}'.format(key, json_data[key]))
501533
genre1=json_data['itemListElement'][1]['item']['name']
502-
try: # Not all books have two genre tags.
534+
try:
503535
genre2=json_data['itemListElement'][2]['item']['name']
504536
except:
505537
continue

0 commit comments

Comments
 (0)