Add the ability to scope the fetched links
fredwu committed Oct 10, 2023
1 parent abe5ff6 commit bbac696
Showing 11 changed files with 117 additions and 49 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,9 @@

## master

+ - [Added] Add `:force` option
+ - [Added] Add `:scope` option

## v1.4.0 [2023-10-07]

- [Added] Allow multiple instances of Crawler sharing the same queue
2 changes: 2 additions & 0 deletions README.md
@@ -59,6 +59,8 @@ There are several ways to access the crawled page data:
| `:max_pages` | integer | `:infinity` | Maximum amount of pages to crawl. |
| `:timeout` | integer | `5000` | Timeout value for fetching a page, in ms. Can also be set to `:infinity`, useful when combined with `Crawler.pause/1`. |
| `:store` | module | `nil` | Module for storing the crawled page data and crawling metadata, defaults to `nil`. You can also set it to `Crawler.Store` or your own, see `Crawler.Store.add_page_data/3` for implementation details. |
+ | `:force` | boolean | `false` | Force crawling URLs even if they have already been crawled, useful if you want to refresh the crawled data. |
+ | `:scope` | term | `nil` | Similar to `:force`, but you can pass a custom `:scope` to determine how Crawler should perform on links already seen. |
| `:user_agent` | string | `Crawler/x.x.x (...)` | User-Agent value sent by the fetch requests. |
| `:url_filter` | module | `Crawler.Fetcher.UrlFilter` | Custom URL filter, useful for restricting crawlable domains, paths or content types. |
| `:retrier` | module | `Crawler.Fetcher.Retrier` | Custom fetch retrier, useful for retrying failed crawls. |
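As a usage sketch (not part of the diff — the URL and the `:nightly_refresh` scope value are hypothetical), the two new options described above can be combined like this:

```elixir
# Re-crawl a URL even if it has been crawled before; with no explicit
# :scope, Crawler assigns a unique one so this run is tracked separately.
{:ok, _opts} = Crawler.crawl("http://example.com/", force: true, workers: 2)

# Or pass a custom :scope term to control how already-seen links are grouped.
{:ok, _opts} = Crawler.crawl("http://example.com/", force: true, scope: :nightly_refresh)
```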
1 change: 1 addition & 0 deletions lib/crawler.ex
@@ -37,6 +37,7 @@ defmodule Crawler do
opts
|> Enum.into(%{})
|> Options.assign_defaults()
+ |> Options.assign_scope()
|> Options.assign_url(url)

if Store.ops_count() < opts[:max_pages] do
4 changes: 2 additions & 2 deletions lib/crawler/fetcher/policer.ex
@@ -57,8 +57,8 @@ defmodule Crawler.Fetcher.Policer do

defp acceptable_uri_scheme?(_opts), do: {:acceptable_uri_scheme?, true}

- defp not_fetched_yet?(%{url: url} = _opts) do
- {:not_fetched_yet?, !Store.find(url)}
+ defp not_fetched_yet?(%{url: url, scope: scope} = _opts) do
+ {:not_fetched_yet?, !Store.find({url, scope})}
end

defp not_fetched_yet?(_opts), do: {:not_fetched_yet?, true}
10 changes: 5 additions & 5 deletions lib/crawler/fetcher/recorder.ex
@@ -14,7 +14,7 @@ defmodule Crawler.Fetcher.Recorder do
{:ok, %{depth: 3, url: "url1"}}
iex> Recorder.record(url: "url2", depth: 2)
iex> Store.find("url2")
iex> Store.find({"url2", nil})
%Page{url: "url2"}
"""
def record(opts) do
@@ -35,19 +35,19 @@ defmodule Crawler.Fetcher.Recorder do
{:ok, nil}
iex> Recorder.record(url: "url", depth: 2)
- iex> Recorder.maybe_store_page("body", %{store: Store, url: "url"})
- {:ok, {%Page{url: "url", body: "body", opts: %{store: Store, url: "url"}}, %Page{url: "url", body: nil}}}
+ iex> Recorder.maybe_store_page("body", %{store: Store, url: "url", scope: nil})
+ {:ok, {%Page{url: "url", body: "body", opts: %{store: Store, url: "url", scope: nil}}, %Page{url: "url", body: nil}}}
"""
def maybe_store_page(_body, %{store: nil} = _opts) do
{:ok, nil}
end

def maybe_store_page(body, opts) do
- {:ok, opts[:store].add_page_data(opts[:url], body, opts)}
+ {:ok, opts[:store].add_page_data({opts[:url], opts[:scope]}, body, opts)}
end

defp store_url(opts) do
- Store.add(opts[:url])
+ Store.add({opts[:url], opts[:scope]})
end

defp store_url_depth(opts) do
12 changes: 12 additions & 0 deletions lib/crawler/options.ex
@@ -13,6 +13,8 @@ defmodule Crawler.Options do
@max_pages :infinity
@timeout 5_000
@store nil
+ @force false
+ @scope nil
@user_agent "Crawler/#{Mixfile.project()[:version]} (https://github.com/fredwu/crawler)"
@url_filter Crawler.Fetcher.UrlFilter
@retrier Crawler.Fetcher.Retrier
@@ -49,6 +51,8 @@
max_pages: max_pages(),
timeout: timeout(),
store: store(),
+ force: force(),
+ scope: scope(),
user_agent: user_agent(),
url_filter: url_filter(),
retrier: retrier(),
@@ -84,6 +88,12 @@
Map.merge(opts, %{url: url})
end

+ def assign_scope(%{force: true, scope: nil} = opts) do
+ Map.merge(opts, %{scope: System.unique_integer()})
+ end

+ def assign_scope(opts), do: opts

defp assets, do: Application.get_env(:crawler, :assets, @assets)
defp save_to, do: Application.get_env(:crawler, :save_to, @save_to)
defp workers, do: Application.get_env(:crawler, :workers, @workers)
@@ -92,6 +102,8 @@
defp max_pages, do: Application.get_env(:crawler, :max_pages, @max_pages)
defp timeout, do: Application.get_env(:crawler, :timeout, @timeout)
defp store, do: Application.get_env(:crawler, :store, @store)
+ defp force, do: Application.get_env(:crawler, :force, @force)
+ defp scope, do: Application.get_env(:crawler, :scope, @scope)
defp user_agent, do: Application.get_env(:crawler, :user_agent, @user_agent)
defp url_filter, do: Application.get_env(:crawler, :url_filter, @url_filter)
defp retrier, do: Application.get_env(:crawler, :retrier, @retrier)
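To make the new `assign_scope/1` plumbing concrete, here is a sketch of how it behaves (not part of the commit; the maps are trimmed to the relevant keys):

```elixir
# A forced crawl without an explicit scope gets a unique integer scope,
# so URLs crawled earlier are treated as unseen for this run.
Crawler.Options.assign_scope(%{force: true, scope: nil})
#=> %{force: true, scope: -576460752303420000}   # value is whatever System.unique_integer/0 returns

# Anything else passes through unchanged.
Crawler.Options.assign_scope(%{force: false, scope: nil})
#=> %{force: false, scope: nil}

Crawler.Options.assign_scope(%{force: true, scope: :nightly_refresh})
#=> %{force: true, scope: :nightly_refresh}
```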
20 changes: 10 additions & 10 deletions lib/crawler/store.ex
@@ -31,8 +31,8 @@ defmodule Crawler.Store do
@doc """
Finds a stored URL and returns its page data.
"""
- def find(url) do
- case Registry.lookup(DB, url) do
+ def find({url, scope}) do
+ case Registry.lookup(DB, {url, scope}) do
[{_, page}] -> page
_ -> nil
end
@@ -41,8 +41,8 @@
@doc """
Finds a stored URL and returns its page data only if it's processed.
"""
- def find_processed(url) do
- case Registry.match(DB, url, %{processed: true}) do
+ def find_processed({url, scope}) do
+ case Registry.match(DB, {url, scope}, %{processed: true}) do
[{_, page}] -> page
_ -> nil
end
@@ -51,22 +51,22 @@
@doc """
Adds a URL to the registry.
"""
- def add(url) do
- Registry.register(DB, url, %Page{url: url})
+ def add({url, scope}) do
+ Registry.register(DB, {url, scope}, %Page{url: url})
end

@doc """
Adds the page data for a URL to the registry.
"""
- def add_page_data(url, body, opts) do
- {_new, _old} = Registry.update_value(DB, url, &%{&1 | body: body, opts: opts})
+ def add_page_data({url, scope}, body, opts) do
+ {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | body: body, opts: opts})
end

@doc """
Marks a URL as processed in the registry.
"""
- def processed(url) do
- {_new, _old} = Registry.update_value(DB, url, &%{&1 | processed: true})
+ def processed({url, scope}) do
+ {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | processed: true})
end

def all_urls do
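The effect of the key change above, sketched outside the commit (URLs and scope values are hypothetical, and the calls assume the Crawler application and its registry are running): entries are now registered under `{url, scope}` tuples, so the same URL can be stored once per scope.

```elixir
# Default (unscoped) crawls record pages under a nil scope.
Crawler.Store.add({"http://example.com/", nil})
Crawler.Store.find({"http://example.com/", nil})
#=> %Crawler.Store.Page{url: "http://example.com/", ...}

# A forced crawl records the same URL again under its own scope, so the
# Policer's not_fetched_yet? check no longer skips it.
Crawler.Store.add({"http://example.com/", 123})
Crawler.Store.find({"http://example.com/", 456})
#=> nil    # no entry for that URL under this scope
```

Keying the registry by the tuple keeps forced runs isolated without wiping previously stored pages.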
5 changes: 4 additions & 1 deletion lib/crawler/worker.ex
@@ -53,6 +53,9 @@ defmodule Crawler.Worker do
{:noreply, state}
end

- defp mark_processed({:ok, %Page{url: url}}), do: Store.processed(url)
+ defp mark_processed({:ok, %Page{url: url, opts: opts}}) do
+ Store.processed({url, opts[:scope]})
+ end

defp mark_processed(_), do: nil
end
4 changes: 2 additions & 2 deletions test/lib/crawler/fetcher/policer_test.exs
@@ -63,9 +63,9 @@ defmodule Crawler.Fetcher.PolicerTest do
end

test "fetched error" do
Crawler.Store.add("http://policer/exist/")
Crawler.Store.add({"http://policer/exist/", nil})

assert {:warn, "Fetch failed check 'not_fetched_yet?', with opts: " <> _} =
- Policer.police(%{url: "http://policer/exist/"})
+ Policer.police(%{url: "http://policer/exist/", scope: nil})
end
end
8 changes: 4 additions & 4 deletions test/lib/crawler/fetcher_test.exs
@@ -37,7 +37,7 @@ defmodule Crawler.FetcherTest do
|> Map.merge(%{url: url})
|> Fetcher.fetch()

- page = Store.find(url)
+ page = Store.find({url, nil})

assert page.url == url
assert page.body == "<html>200</html>"
@@ -56,7 +56,7 @@
|> Fetcher.fetch()

assert fetcher == {:warn, "Failed to fetch #{url}, status code: 500"}
- refute Store.find(url).body
+ refute Store.find({url, nil}).body
end

test "failure: timeout", %{bypass: bypass, url: url} do
@@ -74,7 +74,7 @@
|> Fetcher.fetch()

assert fetcher == {:warn, "Failed to fetch #{url}, reason: :timeout"}
- refute Store.find(url).body
+ refute Store.find({url, nil}).body
end

test "failure: retries", %{bypass: bypass, url: url} do
@@ -91,7 +91,7 @@
|> Fetcher.fetch()

assert fetcher == {:warn, "Failed to fetch #{url}, status code: 500"}
- refute Store.find(url).body
+ refute Store.find({url, nil}).body
end)
end

97 changes: 72 additions & 25 deletions test/lib/crawler_test.exs
@@ -60,20 +60,20 @@ defmodule CrawlerTest do
end)

wait(fn ->
- assert %Store.Page{url: ^url, opts: %{workers: 3}} = Store.find_processed(url)
+ assert %Store.Page{url: ^url, opts: %{workers: 3}} = Store.find_processed({url, nil})

- assert Store.find_processed(linked_url1)
- assert Store.find_processed(linked_url2)
- assert Store.find_processed(linked_url3)
- refute Store.find(linked_url4)
+ assert Store.find_processed({linked_url1, nil})
+ assert Store.find_processed({linked_url2, nil})
+ assert Store.find_processed({linked_url3, nil})
+ refute Store.find({linked_url4, nil})

urls = Crawler.Store.all_urls()

- assert Enum.member?(urls, url)
- assert Enum.member?(urls, linked_url1)
- assert Enum.member?(urls, linked_url2)
- assert Enum.member?(urls, linked_url3)
- refute Enum.member?(urls, linked_url4)
+ assert Enum.member?(urls, {url, nil})
+ assert Enum.member?(urls, {linked_url1, nil})
+ assert Enum.member?(urls, {linked_url2, nil})
+ assert Enum.member?(urls, {linked_url3, nil})
+ refute Enum.member?(urls, {linked_url4, nil})
end)

wait(fn ->
@@ -92,7 +92,7 @@
{:ok, opts} = Crawler.crawl(url, max_depths: 1, workers: 1, interval: 100)

wait(fn ->
- assert %Store.Page{url: ^url, body: nil, opts: nil} = Store.find_processed(url)
+ assert %Store.Page{url: ^url, body: nil, opts: nil} = Store.find_processed({url, nil})
end)

wait(fn ->
@@ -135,11 +135,11 @@
end)

wait(fn ->
- assert Store.find_processed(url)
- assert Store.find_processed(linked_url1)
- assert Store.find_processed(linked_url2)
- refute Store.find(linked_url3)
- refute Store.find(linked_url4)
+ assert Store.find_processed({url, nil})
+ assert Store.find_processed({linked_url1, nil})
+ assert Store.find_processed({linked_url2, nil})
+ refute Store.find({linked_url3, nil})
+ refute Store.find({linked_url4, nil})
end)

wait(fn ->
@@ -184,17 +184,17 @@
end)

wait(fn ->
- assert Store.find_processed(linked_url1)
- assert Store.find_processed(linked_url2)
- assert Store.find_processed(linked_url3)
- refute Store.find_processed(linked_url4)
+ assert Store.find_processed({linked_url1, nil})
+ assert Store.find_processed({linked_url2, nil})
+ assert Store.find_processed({linked_url3, nil})
+ refute Store.find_processed({linked_url4, nil})

urls = Crawler.Store.all_urls()

- assert Enum.member?(urls, linked_url1)
- assert Enum.member?(urls, linked_url2)
- assert Enum.member?(urls, linked_url3)
- refute Enum.member?(urls, linked_url4)
+ assert Enum.member?(urls, {linked_url1, nil})
+ assert Enum.member?(urls, {linked_url2, nil})
+ assert Enum.member?(urls, {linked_url3, nil})
+ refute Enum.member?(urls, {linked_url4, nil})
end)

wait(fn ->
@@ -203,6 +203,53 @@
end)
end

test ".crawl forced", %{bypass: bypass, url: url} do
Store.ops_reset()

url = "#{url}/crawler_forced"
linked_url1 = "#{url}/link1"
linked_url2 = "#{url}/link2"

Bypass.expect(bypass, "GET", "/crawler_forced", fn conn ->
Plug.Conn.resp(conn, 200, """
<html><a href="#{linked_url1}">1</a></html>
<html><a href="#{linked_url1}">1</a></html>
""")
end)

Bypass.expect(bypass, "GET", "/crawler_forced/link1", fn conn ->
Plug.Conn.resp(conn, 200, """
<html><a id="link2" href="#{linked_url2}" target="_blank">2</a></html>
""")
end)

Bypass.expect(bypass, "GET", "/crawler_forced/link2", fn conn ->
Plug.Conn.resp(conn, 200, """
<html>ok</html>
""")
end)

{:ok, opts1} = Crawler.crawl(url, force: true, workers: 1, interval: 100)
{:ok, opts2} = Crawler.crawl(url, force: true, workers: 2, interval: 100)

refute opts1[:scope] == opts2[:scope]

wait(fn ->
assert Store.find_processed({url, opts1[:scope]})
assert Store.find_processed({url, opts2[:scope]})
assert Store.find_processed({linked_url1, opts1[:scope]})
assert Store.find_processed({linked_url1, opts2[:scope]})
assert Store.find_processed({linked_url2, opts1[:scope]})
assert Store.find_processed({linked_url2, opts2[:scope]})

assert Store.ops_count() >= 6
assert Store.ops_count() <= 10

assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
end)
end

test ".crawl stopped", %{bypass: bypass, url: url} do
url = "#{url}/stop"
linked_url = "#{url}/stop1"
@@ -219,6 +266,6 @@ defmodule CrawlerTest do

Crawler.stop(opts)

- refute Store.find(linked_url)
+ refute Store.find({linked_url, nil})
end
end
