From 2a41126b891f7489073310630ad2034554db3349 Mon Sep 17 00:00:00 2001 From: rooot Date: Sun, 17 Nov 2024 23:25:26 +0100 Subject: [PATCH 1/2] feat: more robust copyright parsing intends to fix #126 Signed-off-by: rooot --- apod/utility.py | 57 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/apod/utility.py b/apod/utility.py index 3d57eac..64abf3d 100644 --- a/apod/utility.py +++ b/apod/utility.py @@ -179,6 +179,7 @@ def _copyright(soup): for element in soup.findAll('a', text=True): # LOG.debug("TEXT: "+element.text) + # TODO: this breaks for APODs like 2024-11-09, where it credits "Voyager" instead of "Voyager 2" if use_next: copyright_text = element.text.strip(' ') break @@ -188,23 +189,69 @@ def _copyright(soup): use_next = True if not copyright_text: + LOG.debug("didn't find copyright using first method!") for element in soup.findAll(['b', 'a'], text=True): # search text for explicit match - if 'Copyright' in element.text: - LOG.debug('Found Copyright text:' + str(element.text)) + if 'Copyright' in element.text or 'Image Credit' in element.text: + LOG.debug('Found Potential Copyright text:' + str(element.text)) # pull the copyright from the link text which follows sibling = element.next_sibling stuff = "" + + # these are used for checking when to add attribution + # if an image contains no direct copyright/license mentions AND credits NASA, + # we can assume the image is public domain - if not, we add the attribution + found_license_mention = False + found_nasa_credit = False + while sibling: try: - stuff = stuff + sibling.text - except Exception: + # clean up the text a bit and get rid of double spaces + sibling_text = sibling.text.replace('\n', ' ').replace(' ', ' ') + + # LOG.debug("!!! 
adding1: |" + sibling_text + "|") + stuff = stuff + sibling_text + + if sibling_text.lower().strip(' ') == "nasa": + found_nasa_credit = True + LOG.debug(">> found NASA credit!") + + # handle edge cases for licenses and copyright. might not work for all cases yet + if "license" in sibling_text.lower() or "copyright" in sibling_text.lower(): + LOG.debug(">> found license mention!") + found_license_mention = True + for link in sibling.findAll('a', text=True): + LOG.debug("LINK:" + str(link)) + + if "license" in link.text.lower() or "copyright" in link.text.lower(): + LOG.debug("License link: |" + str(link) + "| from |" + str(sibling_text) + "|") + LOG.debug("stuff before: |" + stuff + "|") + + # adding license link - clean up the URL and text, just in case + clean_link = link["href"].strip('\n').strip(' ') + license = clean_link + " " + link.text.strip('\n').strip(' ') + LOG.debug("license info:" + license) + # make license prettier if we can by checking for the type of license + # todo: add more licenses, maybe? + if "creativecommons.org/licenses/by/2.0" in license: + license = "CC-BY-2.0" + + LOG.debug("!!! 
adding: |" + license + "|") + stuff = stuff + " " + license + + except Exception as ex: + LOG.warning("exception in copyright handler (sibling): " + str(ex)) pass sibling = sibling.next_sibling if stuff: - copyright_text = stuff.strip(' ') + if not found_license_mention and found_nasa_credit: + LOG.debug("image is likely public domain - explicit NASA credit and no license/copyright mentions found") + copyright_text = None + else: + # LOG.debug("found license or copyright") + copyright_text = stuff.strip(' ').replace(' ', ' ') try: copyright_text = copyright_text.encode('latin1').decode('cp1252') except Exception as ex: From 37db239c035067322fb4cd2e27c6e119a7fe6a62 Mon Sep 17 00:00:00 2001 From: rooot Date: Sun, 17 Nov 2024 23:35:57 +0100 Subject: [PATCH 2/2] fix: tests and add new test case Signed-off-by: rooot --- tests/apod/test_service.py | 2 +- tests/apod/test_utility.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/apod/test_service.py b/tests/apod/test_service.py index 8d96f73..b6ddb91 100644 --- a/tests/apod/test_service.py +++ b/tests/apod/test_service.py @@ -12,5 +12,5 @@ class TestPageNotFound(unittest.TestCase): def test(self, mock_abort): GIVEN = Exception('example exception') - applicaiton.page_not_found(GIVEN) + application.page_not_found(GIVEN) mock_abort.assert_called_once() diff --git a/tests/apod/test_utility.py b/tests/apod/test_utility.py index ed17e7c..0bb59d1 100644 --- a/tests/apod/test_utility.py +++ b/tests/apod/test_utility.py @@ -10,7 +10,19 @@ class TestApod(unittest.TestCase): """Test the extraction of APOD characteristics.""" - TEST_DATA = { + TEST_DATA = { + 'normal page (2024), copyright' : + { + "datetime": datetime(2024, 11, 17), + "copyright": 'Hubble, NASA, ESA; Processing & License: CC-BY-2.0 Judy Schmidt', + "date": "2024-11-17", + "explanation": "What is the cause of this unusual parabolic structure? 
This illuminated cavity, known as LDN 1471, was created by a newly forming star, seen as the bright source at the peak of the parabola. This protostar is experiencing a stellar outflow which is then interacting with the surrounding material in the Perseus Molecular Cloud, causing it to brighten. We see only one side of the cavity -- the other side is hidden by dark dust. The parabolic shape is caused by the widening of the stellar-wind blown cavity over time. Two additional structures can also be seen either side of the protostar; these are known as Herbig-Haro objects, again caused by the interaction of the outflow with the surrounding material. What causes the striations on the cavity walls, though, remains unknown. The featured image was taken by NASA and ESA’s Hubble Space Telescope after an original detection by the Spitzer Space Telescope. Explore Your Universe: Random APOD Generator", + "hdurl": "https://apod.nasa.gov/apod/image/2411/LDN1471_HubbleSchmidt_1024.jpg", + "media_type": "image", + "service_version": "v1", + "title": "LDN 1471: A Windblown Star Cavity", + "url": "https://apod.nasa.gov/apod/image/2411/LDN1471_HubbleSchmidt_960.jpg", + }, 'normal page, copyright' : { "datetime": datetime(2017, 3, 22), @@ -51,7 +63,7 @@ class TestApod(unittest.TestCase): { "datetime": datetime(2013, 3, 11), # this illustrates problematic, but still functional parsing of the copyright - "copyright": 'Martin RietzeAlien Landscapes on Planet Earth', + "copyright": 'Martin Rietze (Alien Landscapes on Planet Earth)', "date": "2013-03-11", "explanation": "Why does a volcanic eruption sometimes create lightning? Pictured above, the Sakurajima volcano in southern Japan was caught erupting in early January. Magma bubbles so hot they glow shoot away as liquid rock bursts through the Earth's surface from below. The above image is particularly notable, however, for the lightning bolts caught near the volcano's summit. 
Why lightning occurs even in common thunderstorms remains a topic of research, and the cause of volcanic lightning is even less clear. Surely, lightning bolts help quench areas of opposite but separated electric charges. One hypothesis holds that catapulting magma bubbles or volcanic ash are themselves electrically charged, and by their motion create these separated areas. Other volcanic lightning episodes may be facilitated by charge-inducing collisions in volcanic dust. Lightning is usually occurring somewhere on Earth, typically over 40 times each second.", "hdurl": "https://apod.nasa.gov/apod/image/1303/volcano_reitze_1280.jpg",