Added better support for daily free books and supplement fields

leoncvlt · Jul 19, 2020 · 120d91a · 120d91a
1 parent 81bbaf4
commit 120d91a
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 20 deletions.
diff --git a/blinkistscraper/generator.py b/blinkistscraper/generator.py
@@ -37,10 +37,10 @@ def generate_book_html(book_json_or_file, cover_img_file=False):
     for chapter_json in book_json['chapters']:
       chapter_html = chapter_template
       for chapter_key in chapter_json:
-        if chapter_json[chapter_key]:
-          chapter_html = chapter_html.replace(f'{{{chapter_key}}}', str(chapter_json[chapter_key]))
-        else:
-          chapter_html = chapter_html.replace(f'{{{chapter_key}}}', "")
+        # sanitize null keys (e.g. supplement)
+        if not chapter_json[chapter_key]:
+          chapter_json[chapter_key] = ""
+        chapter_html = chapter_html.replace(f'{{{chapter_key}}}', str(chapter_json[chapter_key]))
       chapters_html.append(chapter_html)
 
   book_html = book_html.replace('{__chapters__}', "\n".join(chapters_html))
@@ -77,10 +77,12 @@ def generate_book_epub(book_json_or_file):
   for chapter_json in book_json['chapters']:
     chapter = epub.EpubHtml(title=chapter_json['title'], file_name=f"chapter_{chapter_json['order_no']}.xhtml", lang='hr')
 
-    if chapter_json['supplement']:
-      chapter.content = f"<h2>{chapter_json['title']}</h2>" + chapter_json['text'] + chapter_json['supplement']
-    else:
-      chapter.content = f"<h2>{chapter_json['title']}</h2>" + chapter_json['text']
+    title = chapter_json.get("title")
+    content = chapter_json.get("content")
+    supplement = chapter_json.get("supplement") or ""
+
+    chapter.content = f"<h2>{title}</h2>" + content + supplement
+
     book.add_item(chapter)
     chapters.append(chapter)
 

diff --git a/blinkistscraper/scraper.py b/blinkistscraper/scraper.py
@@ -209,17 +209,40 @@ def scrape_book_data(driver, book_url, match_language="", category={ "label" : "
   book['title'] = sanitize_name(book['title'])
   book['author'] = sanitize_name(book['author'])
 
-  # scrape the chapter's content on the reader page
-  # and extend the book json data by inserting the scraped content
-  # in the appropriate chapter section to get a complete data file
-  book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter");
-  for chapter in book_chapters:
-    chapter_no = chapter.get_attribute('data-chapterno')
-    chapter_content = chapter.find_element_by_class_name("chapter__content")
-    for chapter_json in book['chapters']:
-      if chapter_json['order_no'] == int(chapter_no):
-        chapter_json['content'] = chapter_content.get_attribute('innerHTML')
-        break
+  # check if the book's metadata already has chapter content
+  # (this is the case for the free book of the day)
+  json_needs_content = False;
+  for chapter_json in book['chapters']:
+    if not "text" in chapter_json:
+      json_needs_content = True
+      break;
+    else:
+      # change the text content key name for compatibility with the script methods
+      chapter_json['content'] = chapter_json.pop('text')
+
+  if json_needs_content:
+    # scrape the chapter's content on the reader page
+    # and extend the book json data by inserting the scraped content
+    # in the appropriate chapter section to get a complete data file
+    book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter");
+    for chapter in book_chapters:
+      chapter_no = chapter.get_attribute('data-chapterno')
+      chapter_content = chapter.find_element_by_class_name("chapter__content")
+      for chapter_json in book['chapters']:
+        if chapter_json['order_no'] == int(chapter_no):
+          chapter_json['content'] = chapter_content.get_attribute('innerHTML')
+          break
+
+    # look for supplement sections; read each one's own content (not the last chapter's)
+    book_supplements = driver.find_elements(By.CSS_SELECTOR, ".chapter.supplement");
+    for supplement in book_supplements:
+      chapter_no = supplement.get_attribute('data-chapterno')
+      supplement_content = supplement.find_element_by_class_name("chapter__content")
+      for chapter_json in book['chapters']:
+        if chapter_json['order_no'] == int(chapter_no):
+          if not chapter_json.get("supplement", None):
+            chapter_json['supplement'] = supplement_content.get_attribute('innerHTML')
+          break
 
   # if we are scraping by category, add it to the book metadata
   book['category'] = category['label']

diff --git a/templates/chapter.html b/templates/chapter.html
@@ -1,3 +1,3 @@
 <h2>{title}</h2>
-{text}
+{content}
 {supplement}