From 120d91a035101c7f905dea7d51c9dba6cace9c6e Mon Sep 17 00:00:00 2001 From: Leonardo Cavaletti Date: Sun, 19 Jul 2020 15:22:53 +0100 Subject: [PATCH] Added better support for daily free books and supplement fields --- blinkistscraper/generator.py | 18 ++++++++------- blinkistscraper/scraper.py | 45 +++++++++++++++++++++++++++--------- templates/chapter.html | 2 +- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/blinkistscraper/generator.py b/blinkistscraper/generator.py index 647299b..3679d0e 100644 --- a/blinkistscraper/generator.py +++ b/blinkistscraper/generator.py @@ -37,10 +37,10 @@ def generate_book_html(book_json_or_file, cover_img_file=False): for chapter_json in book_json['chapters']: chapter_html = chapter_template for chapter_key in chapter_json: - if chapter_json[chapter_key]: - chapter_html = chapter_html.replace(f'{{{chapter_key}}}', str(chapter_json[chapter_key])) - else: - chapter_html = chapter_html.replace(f'{{{chapter_key}}}', "") + # sanitize null keys (e.g. supplement) + if not chapter_json[chapter_key]: + chapter_json[chapter_key] = "" + chapter_html = chapter_html.replace(f'{{{chapter_key}}}', str(chapter_json[chapter_key])) chapters_html.append(chapter_html) book_html = book_html.replace('{__chapters__}', "\n".join(chapters_html)) @@ -77,10 +77,12 @@ def generate_book_epub(book_json_or_file): for chapter_json in book_json['chapters']: chapter = epub.EpubHtml(title=chapter_json['title'], file_name=f"chapter_{chapter_json['order_no']}.xhtml", lang='hr') - if chapter_json['supplement']: - chapter.content = f"

{chapter_json['title']}

" + chapter_json['text'] + chapter_json['supplement'] - else: - chapter.content = f"

{chapter_json['title']}

" + chapter_json['text'] + title = chapter_json.get("title") + content = chapter_json.get("content") + supplement = chapter_json.get("supplement") or "" + + chapter.content = f"

{title}

" + content + supplement + book.add_item(chapter) chapters.append(chapter) diff --git a/blinkistscraper/scraper.py b/blinkistscraper/scraper.py index a5d82dc..f9499fb 100644 --- a/blinkistscraper/scraper.py +++ b/blinkistscraper/scraper.py @@ -209,17 +209,40 @@ def scrape_book_data(driver, book_url, match_language="", category={ "label" : " book['title'] = sanitize_name(book['title']) book['author'] = sanitize_name(book['author']) - # scrape the chapter's content on the reader page - # and extend the book json data by inserting the scraped content - # in the appropriate chapter section to get a complete data file - book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter"); - for chapter in book_chapters: - chapter_no = chapter.get_attribute('data-chapterno') - chapter_content = chapter.find_element_by_class_name("chapter__content") - for chapter_json in book['chapters']: - if chapter_json['order_no'] == int(chapter_no): - chapter_json['content'] = chapter_content.get_attribute('innerHTML') - break + # check if the book's metadata already has chapter content + # (this is the case for the free book of the day) + json_needs_content = False; + for chapter_json in book['chapters']: + if not "text" in chapter_json: + json_needs_content = True + break; + else: + # change the text content key name for compatibility with the script methods + chapter_json['content'] = chapter_json.pop('text') + + if json_needs_content: + # scrape the chapter's content on the reader page + # and extend the book json data by inserting the scraped content + # in the appropriate chapter section to get a complete data file + book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter"); + for chapter in book_chapters: + chapter_no = chapter.get_attribute('data-chapterno') + chapter_content = chapter.find_element_by_class_name("chapter__content") + for chapter_json in book['chapters']: + if chapter_json['order_no'] == int(chapter_no): + chapter_json['content'] = chapter_content.get_attribute('innerHTML') + break + + # look for any supplement sections + book_supplements = driver.find_elements(By.CSS_SELECTOR, ".chapter.supplement"); + for supplement in book_supplements: + chapter_no = supplement.get_attribute('data-chapterno') + supplement_content = chapter.find_element_by_class_name("chapter__content") + for chapter_json in book['chapters']: + if chapter_json['order_no'] == int(chapter_no): + if not chapter_json.get("supplement", None): + chapter_json['supplement'] = supplement_content.get_attribute('innerHTML') + break # if we are scraping by category, add it to the book metadata book['category'] = category['label'] diff --git a/templates/chapter.html b/templates/chapter.html index 28aa7d5..f34aeb0 100644 --- a/templates/chapter.html +++ b/templates/chapter.html @@ -1,3 +1,3 @@

{title}

-{text} +{content} {supplement}