Skip to content

Commit

Permalink
Added better support for daily free books and supplement fields
Browse files Browse the repository at this point in the history
  • Loading branch information
leoncvlt committed Jul 19, 2020
1 parent 81bbaf4 commit 120d91a
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 20 deletions.
18 changes: 10 additions & 8 deletions blinkistscraper/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ def generate_book_html(book_json_or_file, cover_img_file=False):
for chapter_json in book_json['chapters']:
chapter_html = chapter_template
for chapter_key in chapter_json:
if chapter_json[chapter_key]:
chapter_html = chapter_html.replace(f'{{{chapter_key}}}', str(chapter_json[chapter_key]))
else:
chapter_html = chapter_html.replace(f'{{{chapter_key}}}', "")
# sanitize null keys (e.g. supplement)
if not chapter_json[chapter_key]:
chapter_json[chapter_key] = ""
chapter_html = chapter_html.replace(f'{{{chapter_key}}}', str(chapter_json[chapter_key]))
chapters_html.append(chapter_html)

book_html = book_html.replace('{__chapters__}', "\n".join(chapters_html))
Expand Down Expand Up @@ -77,10 +77,12 @@ def generate_book_epub(book_json_or_file):
for chapter_json in book_json['chapters']:
chapter = epub.EpubHtml(title=chapter_json['title'], file_name=f"chapter_{chapter_json['order_no']}.xhtml", lang='hr')

if chapter_json['supplement']:
chapter.content = f"<h2>{chapter_json['title']}</h2>" + chapter_json['text'] + chapter_json['supplement']
else:
chapter.content = f"<h2>{chapter_json['title']}</h2>" + chapter_json['text']
title = chapter_json.get("title")
content = chapter_json.get("content")
supplement = chapter_json.get("supplement") or ""

chapter.content = f"<h2>{title}</h2>" + content + supplement

book.add_item(chapter)
chapters.append(chapter)

Expand Down
45 changes: 34 additions & 11 deletions blinkistscraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,17 +209,40 @@ def scrape_book_data(driver, book_url, match_language="", category={ "label" : "
book['title'] = sanitize_name(book['title'])
book['author'] = sanitize_name(book['author'])

# scrape the chapter's content on the reader page
# and extend the book json data by inserting the scraped content
# in the appropriate chapter section to get a complete data file
book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter");
for chapter in book_chapters:
chapter_no = chapter.get_attribute('data-chapterno')
chapter_content = chapter.find_element_by_class_name("chapter__content")
for chapter_json in book['chapters']:
if chapter_json['order_no'] == int(chapter_no):
chapter_json['content'] = chapter_content.get_attribute('innerHTML')
break
# check if the book's metadata already has chapter content
# (this is the case for the free book of the day)
json_needs_content = False;
for chapter_json in book['chapters']:
if not "text" in chapter_json:
json_needs_content = True
break;
else:
# change the text content key name for compatibility with the script methods
chapter_json['content'] = chapter_json.pop('text')

if json_needs_content:
# scrape the chapter's content on the reader page
# and extend the book json data by inserting the scraped content
# in the appropriate chapter section to get a complete data file
book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter");
for chapter in book_chapters:
chapter_no = chapter.get_attribute('data-chapterno')
chapter_content = chapter.find_element_by_class_name("chapter__content")
for chapter_json in book['chapters']:
if chapter_json['order_no'] == int(chapter_no):
chapter_json['content'] = chapter_content.get_attribute('innerHTML')
break

# Look for any supplement sections on the reader page and attach each one
# to the chapter entry in book['chapters'] with the same order number.
book_supplements = driver.find_elements(By.CSS_SELECTOR, ".chapter.supplement")
for supplement in book_supplements:
    chapter_no = supplement.get_attribute('data-chapterno')
    # Query the supplement element itself, not the `chapter` variable left
    # over from the chapter loop: that one points at the last chapter (or is
    # unbound if the chapter loop never ran), which would give every
    # supplement the wrong HTML.
    supplement_content = supplement.find_element_by_class_name("chapter__content")
    for chapter_json in book['chapters']:
        if chapter_json['order_no'] == int(chapter_no):
            # Keep a non-empty supplement that the metadata already carries.
            if not chapter_json.get("supplement"):
                chapter_json['supplement'] = supplement_content.get_attribute('innerHTML')
            break

# if we are scraping by category, add it to the book metadata
book['category'] = category['label']
Expand Down
2 changes: 1 addition & 1 deletion templates/chapter.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
<h2>{title}</h2>
{text}
{content}
{supplement}

0 comments on commit 120d91a

Please sign in to comment.