From c4429d0533e0153fccb46dd044ebca5bf81fd5cb Mon Sep 17 00:00:00 2001 From: Dereck Smith Date: Sat, 30 Mar 2024 19:17:16 +0530 Subject: [PATCH] Filters are ready to use! Now I should write the spec for it. --- src/mail_harvester/core.clj | 20 +++++++++++++++--- src/mail_harvester/scraper.clj | 37 ++++++++++++++++++++-------------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/mail_harvester/core.clj b/src/mail_harvester/core.clj index 5bf89ff..a53a916 100644 --- a/src/mail_harvester/core.clj +++ b/src/mail_harvester/core.clj @@ -3,7 +3,9 @@ (:require [cljfx.api :as fx] [cljfx.css :as css] [mail-harvester.scraper :as scraper] - [clojure.core.async :refer [thread]]) + [clojure.core.async :refer [thread]] + [clojure.data.csv :as csv] + [clojure.java.io :as io]) (:import [javafx.application Platform] [javafx.stage FileChooser] [javafx.event ActionEvent] @@ -15,8 +17,12 @@ (def *state (atom {:status "Not in use" :url "" + :filters [] :browser "Chrome"})) +(def filters + (atom '())) + ;; The theme for the app (def style (css/register ::style @@ -78,6 +84,7 @@ (thread (try (let [res (scraper/scrape-url (-> @*state :url) (-> @*state :browser) + @filters "emails")] ;; Export it to a CSV (scraper/write-to-exports res "emails") @@ -86,6 +93,7 @@ (catch Exception e ;; Write an error log (spit "error.txt" e) + (println e) ;; And tell the user an error occured (swap! *state assoc :status "Error! Please file an issue on GitHub"))))) :text "Scrape URL for emails"}) @@ -101,6 +109,7 @@ (thread (try (let [res (scraper/scrape-url (-> @*state :url) (-> @*state :browser) + @filters "links")] ;; Export it to a CSV (scraper/write-to-exports res "links") @@ -108,6 +117,7 @@ (swap! *state assoc :status "Scraping Done!")) (catch Exception e ;; Write an error log + (spit "error.txt" e) (println e) ;; And tell the user an error occured (swap! *state assoc :status "Error! Please file an issue on GitHub"))))) @@ -121,9 +131,13 @@ :on-action (fn [^ActionEvent event] (let [window (.getWindow (.getScene ^Node (.getTarget event))) chooser (doto (FileChooser.) - (.setTitle "Select CSV Filters"))] + (.setTitle "Select Filters"))] (when-let [file (.showOpenDialog chooser window)] - (swap! *state :filters (slurp file)))))}) + (swap! *state assoc :status "Loading filters...") + (with-open [reader (io/reader file)] + (doseq [line (line-seq reader)] + (swap! filters conj line)) + (swap! *state assoc :status (format "Filters loaded! %s emails/links will be excluded" (count @filters)))))))}) (defn root "The root app that glues all the components together" diff --git a/src/mail_harvester/scraper.clj b/src/mail_harvester/scraper.clj index 7b8a364..4c8733c 100644 --- a/src/mail_harvester/scraper.clj +++ b/src/mail_harvester/scraper.clj @@ -7,39 +7,46 @@ (defn fetch-emails "Fetches the mailto: links in a website, removes the mailto: prefix and returns a list of emails" - [driver] + [driver filters] (for [rawemail (e/query-all driver {:css "a[href^=\"mailto:\"]"})] - ;; Replace instances of "mailto:" with an empty string - (string/replace (e/get-element-attr-el driver rawemail :href) - #"mailto:" - ""))) + (let [email (string/replace (e/get-element-attr-el driver rawemail :href) + #"mailto:" + "")] + (if (empty? filters) + email + (if (= (.contains filters email) false) + email))))) (defn fetch-links "Fetches any links in a website and returns a list of links" - [driver] + [driver filters] (for [rawlink (e/query-all driver {:css "a"})] ;; Return the raw link from the href attribute - (e/get-element-attr-el driver rawlink :href))) + (let [link (e/get-element-attr-el driver rawlink :href)] + (if (empty? filters) + link + (if (= (.contains filters link) false) + link))))) (defn scrape-url "Scrapes from the URL, with information on which browser to use and what action to perform" - [url browser action] + [url browser filters action] (cond (= browser "Firefox") (let [driver (e/firefox-headless {:path-driver "./drivers/geckodriver"})] (e/go driver url) - (cond (= action "emails") (fetch-emails driver) - (= action "links") (fetch-links driver) + (cond (= action "emails") (fetch-emails driver filters) + (= action "links") (fetch-links driver filters) :else (println (format "Unknown action \"%s\" ignored" action)))) (= browser "Chrome") (let [driver (e/chrome-headless {:path-driver "./drivers/chromedriver"})] (e/go driver url) - (cond (= action "emails") (fetch-emails driver) - (= action "links") (fetch-links driver) + (cond (= action "emails") (fetch-emails driver filters) + (= action "links") (fetch-links driver filters) :else (println (format "Unknown action \"%s\" ignored" action)))) (= browser "Safari") (let [driver (e/safari)] (e/go driver url) - (cond (= action "emails") (fetch-emails driver) - (= action "links") (fetch-links driver) - :else (println (format "Unknown action \"%s\" ignored" action)))) + (cond (= action "emails") (fetch-emails driver filters) + (= action "links") (fetch-links driver filters) + :else (println (format "Unknown action \"%s\" ignored" action)))) :else (throw (Exception. (str "Browser is not valid: " browser))))) (defn write-to-exports