Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Commit

Permalink
Improve license URL validation (#1028)
Browse files Browse the repository at this point in the history
* Improve license URL validation

* Use the actual URL instead of the part

* Fix incorrect jamendo test

* Add tests

`_get_valid_cc_url` adds a trailing slash and does not make a network request for known licenses

Signed-off-by: Olga Bulat <[email protected]>
  • Loading branch information
obulat authored Mar 10, 2023
1 parent 261538c commit 4c0ceb0
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 8 deletions.
16 changes: 13 additions & 3 deletions openverse_catalog/dags/common/licenses/licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

LICENSE_PATH_MAP = constants.get_license_path_map()
REVERSE_LICENSE_PATH_MAP = constants.get_reverse_license_path_map()
CC_BASE_URL = "https://creativecommons.org/"
# Full URL, with trailing slash, for every license path key in
# LICENSE_PATH_MAP. `_get_valid_cc_url` returns a normalized URL found in
# this set directly, before any further parsing or rewriting.
LICENSE_URLS = {f"{CC_BASE_URL}{path}/" for path in LICENSE_PATH_MAP}


class InvalidLicenseURLException(Exception):
Expand Down Expand Up @@ -143,21 +145,29 @@ def _get_valid_cc_url(license_url) -> str | None:
This function enforces:
- string type
- https scheme
- parses into a urllib.parse.ParseResult with
netloc=creativecommons.org
- trailing slash
If the resulting URL is in the `LICENSE_URLS` set, we return it.
Otherwise, we parse URL into a urllib.parse.ParseResult, ensuring
that its netloc=creativecommons.org
After that, we rewrite the URL to whatever we get redirected to when
we make a request using it.
If all of these validations and the rewriting succeed, we return the
rewritten URL. Otherwise, we return None
rewritten URL. Otherwise, we return None.
"""
logger.debug(f"Checking license URL {license_url}")
if type(license_url) != str:
logger.debug(f"License URL is not a string. Type is {type(license_url)}")
return

https_url = urls.add_url_scheme(license_url.lower(), "https")
if not https_url.endswith("/"):
https_url += "/"
if https_url in LICENSE_URLS:
return https_url

parsed_url = urlparse(https_url)

if parsed_url.netloc != "creativecommons.org":
Expand Down
22 changes: 18 additions & 4 deletions tests/dags/common/licenses/test_licenses.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from unittest.mock import patch

import common.urls
import pytest
Expand Down Expand Up @@ -150,20 +151,33 @@ def test_get_license_info_from_url_with_license_url_path_mismatch(
assert all([i is None for i in license_info])


def test_get_license_info_from_url_with_good_license_url(mock_cc_url_validator):
def test_get_license_info_from_url_with_good_license_url():
expected_license, expected_version = "cc0", "1.0"
license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
path_map = {"publicdomain/zero/1.0": ("cc0", "1.0")}
actual_license_info = licenses._get_license_info_from_url(
license_url, path_map=path_map
)

with patch.object(licenses.urls, "rewrite_redirected_url") as mock_rewriter:
actual_license_info = licenses._get_license_info_from_url(
license_url, path_map=path_map
)
expected_license_info = (
expected_license,
expected_version,
license_url,
license_url,
)
assert actual_license_info == expected_license_info
assert mock_rewriter.call_count == 0


def test_get_valid_cc_url_adds_a_trailing_slash():
    """Check that a slash-less license URL comes back with a trailing slash.

    `licenses.urls.rewrite_redirected_url` is patched so the test can also
    verify that it is never called for this URL.
    """
    license_url = "https://creativecommons.org/licenses/by-nc-nd/2.0"
    expected_url = f"{license_url}/"

    with patch.object(licenses.urls, "rewrite_redirected_url") as redirect_mock:
        validated_url = licenses._get_valid_cc_url(license_url)

    # The redirect-following rewriter must not be used for this URL.
    redirect_mock.assert_not_called()
    assert validated_url == expected_url


def test_get_license_info_from_license_pair_nones_when_missing_license(mock_rewriter):
Expand Down
2 changes: 1 addition & 1 deletion tests/dags/providers/provider_api_scripts/test_jamendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_get_record_data():
"license_info": LicenseInfo(
license="by-nc",
version="2.0",
url="https://creativecommons.org/licenses/by-nc/2.0",
url="https://creativecommons.org/licenses/by-nc/2.0/",
raw_url="http://creativecommons.org/licenses/by-nc/2.0/",
),
"meta_data": {
Expand Down

0 comments on commit 4c0ceb0

Please sign in to comment.