From 68af12b51e5dc3fbcbe6e61040262a2788b15247 Mon Sep 17 00:00:00 2001 From: johndoe-dev00 Date: Fri, 26 Mar 2021 22:17:36 +0100 Subject: [PATCH 1/6] improved captcha and cookie handling --- blinkistscraper/scraper.py | 46 +++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/blinkistscraper/scraper.py b/blinkistscraper/scraper.py index 8105583..92765db 100644 --- a/blinkistscraper/scraper.py +++ b/blinkistscraper/scraper.py @@ -131,29 +131,53 @@ def initialize_driver(headless=True, with_ublock=False, chromedriver_path=None): def login(driver, language, email, password): # we need to navigate to a page first in order to load eventual cookies - driver.get(f"https://www.blinkist.com/{language}") + driver.get(f"https://www.blinkist.com/{language}/nc/login") is_logged_in = False # if we have any stored login cookie, load them into the driver if has_login_cookies(): load_login_cookies(driver) + + # assume that a captcha needs to be solved, if no blinkist logo appears within 5sec + try: + WebDriverWait(driver, 5).until( + EC.presence_of_element_located( + (By.CLASS_NAME, "header__logo") + ) + ) + except TimeoutException as ex: + log.info("Please solve captcha to proceed!") + + # fail if captcha not solved within 60sec + try: + WebDriverWait(driver, 60).until( + EC.presence_of_element_located( + (By.CLASS_NAME, "header__logo") + ) + ) + except TimeoutException as ex: + log.ERROR("Error. Captcha needs to be solved within 1 minute") + return False - # navigate to the login page and check for the login email input - # if not found, assume we're logged in + # navigate to the login page sign_in_url = f"https://www.blinkist.com/{language}/nc/login" driver.get(sign_in_url) + + # click on cookie banner, if necessary + time.sleep(1.0) + try: + cookiebanner = driver.find_element_by_class_name("cookie-disclaimer__cta") + except: + pass + else: + cookiebanner.click() + + # check for the login email input. if not found, assume we're logged in try: driver.find_element_by_id("login-form_login_email") except NoSuchElementException: is_logged_in = True - # try: - # WebDriverWait(driver, 360).until( - # EC.presence_of_element_located((By.ID, "login-form_login_email")) - # ) - # except TimeoutException as ex: - # log.error("Error logging in.") - # return False - + # if not logged in, autofill the email and password inputs with the # provided login credentials if not is_logged_in: From 9b2bdf47fec923b5c8a9db5d96145f88c948256f Mon Sep 17 00:00:00 2001 From: johndoe-dev00 Date: Fri, 26 Mar 2021 21:37:30 +0100 Subject: [PATCH 2/6] added cli-option to disable ublock --- blinkistscraper/__main__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/blinkistscraper/__main__.py b/blinkistscraper/__main__.py index 5338f17..a602995 100644 --- a/blinkistscraper/__main__.py +++ b/blinkistscraper/__main__.py @@ -179,6 +179,12 @@ def check_cooldown(value): "--chromedriver", help="Path to a specific chromedriver executable instead of the built-in one", ) + parser.add_argument( + "--no-ublock", + action="store_true", + default=False, + help="Disable the uBlock Chrome extension", + ) parser.add_argument( "-v", "--verbose", action="store_true", help="Increases logging verbosity" ) @@ -274,7 +280,7 @@ def finish(start_time, processed_books, driver=None): match_language = args.language if args.match_language else "" start_headless = args.headless # add uBlock (except on headless) - use_ublock = not args.headless + use_ublock = not args.no_ublock and not args.headless driver = scraper.initialize_driver( headless=start_headless, with_ublock=use_ublock, From 55c59a6e94cceeee28755a5b9979dc520b9bd15a Mon Sep 17 00:00:00 2001 From: johndoe-dev00 Date: Fri, 26 Mar 2021 22:25:41 +0100 Subject: [PATCH 3/6] fixed typo --- blinkistscraper/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blinkistscraper/scraper.py b/blinkistscraper/scraper.py index 92765db..2e02a51 100644 --- a/blinkistscraper/scraper.py +++ b/blinkistscraper/scraper.py @@ -156,7 +156,7 @@ def login(driver, language, email, password): ) ) except TimeoutException as ex: - log.ERROR("Error. Captcha needs to be solved within 1 minute") + log.error("Error. Captcha needs to be solved within 1 minute") return False # navigate to the login page From 86f93f9f57d7e9f6ffa0b1987036cb71bf62d0db Mon Sep 17 00:00:00 2001 From: johndoe-dev00 Date: Fri, 26 Mar 2021 22:35:33 +0100 Subject: [PATCH 4/6] clarify --help text --- blinkistscraper/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blinkistscraper/__main__.py b/blinkistscraper/__main__.py index a602995..23a486f 100644 --- a/blinkistscraper/__main__.py +++ b/blinkistscraper/__main__.py @@ -183,7 +183,7 @@ def check_cooldown(value): "--no-ublock", action="store_true", default=False, - help="Disable the uBlock Chrome extension", + help="Disable the uBlock Chrome extension. Might be needed to solve captcha", ) parser.add_argument( "-v", "--verbose", action="store_true", help="Increases logging verbosity" From 0bcb43d5d46934b9c318ecc02776c8724f76cd15 Mon Sep 17 00:00:00 2001 From: johndoe-dev00 <81249793+johndoe-dev00@users.noreply.github.com> Date: Tue, 30 Mar 2021 18:55:21 +0200 Subject: [PATCH 5/6] allow hcaptcha.com, so captcha will load correctly --- bin/ublock/ublock-settings.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/ublock/ublock-settings.txt b/bin/ublock/ublock-settings.txt index 458f77b..783b298 100644 --- a/bin/ublock/ublock-settings.txt +++ b/bin/ublock/ublock-settings.txt @@ -62,7 +62,7 @@ "wyciwyg-scheme" ], "netWhitelist": "about-scheme\nchrome-extension-scheme\nchrome-scheme\nmoz-extension-scheme\nopera-scheme\nvivaldi-scheme\nwyciwyg-scheme", - "dynamicFilteringString": "behind-the-scene * * noop\nbehind-the-scene * inline-script noop\nbehind-the-scene * 1p-script noop\nbehind-the-scene * 3p-script noop\nbehind-the-scene * 3p-frame noop\nbehind-the-scene * image noop\nbehind-the-scene * 3p noop\n* * 3p block\nwww.blinkist.com blinkist.io * allow\nwww.blinkist.com d17pjsg7x52x9r.cloudfront.net * allow\nwww.blinkist.com jsdelivr.net * block\nwww.blinkist.com hcaptcha.com * block", + "dynamicFilteringString": "behind-the-scene * * noop\nbehind-the-scene * inline-script noop\nbehind-the-scene * 1p-script noop\nbehind-the-scene * 3p-script noop\nbehind-the-scene * 3p-frame noop\nbehind-the-scene * image noop\nbehind-the-scene * 3p noop\n* * 3p block\nwww.blinkist.com blinkist.io * allow\nwww.blinkist.com d17pjsg7x52x9r.cloudfront.net * allow\nwww.blinkist.com jsdelivr.net * block\nwww.blinkist.com hcaptcha.com * allow", "urlFilteringString": "", "hostnameSwitchesString": "no-large-media: behind-the-scene false\nno-remote-fonts: * true", "userFilters": "" From 966f0bb8b480dbcd63c5a0012cfc16aaf2807650 Mon Sep 17 00:00:00 2001 From: johndoe-dev00 <81249793+johndoe-dev00@users.noreply.github.com> Date: Tue, 30 Mar 2021 18:59:05 +0200 Subject: [PATCH 6/6] updated README with new cli-switch --no-ublock --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4d91e99..ff81ce0 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,8 @@ optional arguments: --chromedriver CHROMEDRIVER Path to a specific chromedriver executable instead of the built-in one + --no-ublock Disable the uBlock Chrome extension. Might be needed + to solve captcha -v, --verbose Increases logging verbosity ```