Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more robust copyright parsing #127

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 52 additions & 5 deletions apod/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def _copyright(soup):
for element in soup.findAll('a', text=True):
# LOG.debug("TEXT: "+element.text)

# TODO: this breaks for APODs like 2024-11-09, where it credits "Voyager" instead of "Voyager 2"
if use_next:
copyright_text = element.text.strip(' ')
break
Expand All @@ -188,23 +189,69 @@ def _copyright(soup):
use_next = True

if not copyright_text:
LOG.debug("didn't find copyright using first method!")

for element in soup.findAll(['b', 'a'], text=True):
# search text for explicit match
if 'Copyright' in element.text:
LOG.debug('Found Copyright text:' + str(element.text))
if 'Copyright' in element.text or 'Image Credit' in element.text:
LOG.debug('Found Potential Copyright text:' + str(element.text))
# pull the copyright from the link text which follows
sibling = element.next_sibling
stuff = ""

# these are used for checking when to add attribution
# if an image contains no direct copyright/license mentions AND credits NASA,
# we can assume the image is public domain - if not, we add the attribution
found_license_mention = False
found_nasa_credit = False

while sibling:
try:
stuff = stuff + sibling.text
except Exception:
# clean up the text a bit and get rid of double spaces
sibling_text = sibling.text.replace('\n', ' ').replace(' ', ' ')

# LOG.debug("!!! adding1: |" + sibling_text + "|")
stuff = stuff + sibling_text

if sibling_text.lower().strip(' ') == "nasa":
found_nasa_credit = True
LOG.debug(">> found NASA credit!")

# handle edge cases for licenses and copyright. might not work for all cases yet
if "license" in sibling_text.lower() or "copyright" in sibling_text.lower():
LOG.debug(">> found license mention!")
found_license_mention = True
for link in sibling.findAll('a', text=True):
LOG.debug("LINK:" + str(link))

if "license" in link.text.lower() or "copyright" in link.text.lower():
LOG.debug("License link: |" + str(link) + "| from |" + str(sibling_text) + "|")
LOG.debug("stuff before: |" + stuff + "|")

# adding license link - clean up the URL and text, just in case
clean_link = link["href"].strip('\n').strip(' ')
license = clean_link + " " + link.text.strip('\n').strip(' ')
LOG.debug("license info:" + license)
# make license prettier if we can by checking for the type of license
# todo: add more licenses, maybe?
if "creativecommons.org/licenses/by/2.0" in license:
license = "CC-BY-2.0"

LOG.debug("!!! adding: |" + license + "|")
stuff = stuff + " " + license

except Exception as ex:
LOG.warning("exception in copyright handler (sibling): " + str(ex))
pass
sibling = sibling.next_sibling

if stuff:
copyright_text = stuff.strip(' ')
if not found_license_mention and found_nasa_credit:
LOG.debug("image is likely public domain - explicit NASA credit and no license/copyright mentions found")
copyright_text = None
else:
# LOG.debug("found license or copyright")
copyright_text = stuff.strip(' ').replace(' ', ' ')
try:
copyright_text = copyright_text.encode('latin1').decode('cp1252')
except Exception as ex:
Expand Down
2 changes: 1 addition & 1 deletion tests/apod/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@
class TestPageNotFound(unittest.TestCase):
def test(self, mock_abort):
GIVEN = Exception('example exception')
applicaiton.page_not_found(GIVEN)
application.page_not_found(GIVEN)
mock_abort.assert_called_once()
16 changes: 14 additions & 2 deletions tests/apod/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,19 @@
class TestApod(unittest.TestCase):
"""Test the extraction of APOD characteristics."""

TEST_DATA = {
TEST_DATA = {
'normal page (2024), copyright' :
{
"datetime": datetime(2024, 11, 17),
"copyright": 'Hubble, NASA, ESA; Processing & License: CC-BY-2.0 Judy Schmidt',
"date": "2024-11-17",
"explanation": "What is the cause of this unusual parabolic structure? This illuminated cavity, known as LDN 1471, was created by a newly forming star, seen as the bright source at the peak of the parabola. This protostar is experiencing a stellar outflow which is then interacting with the surrounding material in the Perseus Molecular Cloud, causing it to brighten. We see only one side of the cavity -- the other side is hidden by dark dust. The parabolic shape is caused by the widening of the stellar-wind blown cavity over time. Two additional structures can also be seen either side of the protostar; these are known as Herbig-Haro objects, again caused by the interaction of the outflow with the surrounding material. What causes the striations on the cavity walls, though, remains unknown. The featured image was taken by NASA and ESA’s Hubble Space Telescope after an original detection by the Spitzer Space Telescope. Explore Your Universe: Random APOD Generator",
"hdurl": "https://apod.nasa.gov/apod/image/2411/LDN1471_HubbleSchmidt_1024.jpg",
"media_type": "image",
"service_version": "v1",
"title": "LDN 1471: A Windblown Star Cavity",
"url": "https://apod.nasa.gov/apod/image/2411/LDN1471_HubbleSchmidt_960.jpg",
},
'normal page, copyright' :
{
"datetime": datetime(2017, 3, 22),
Expand Down Expand Up @@ -51,7 +63,7 @@ class TestApod(unittest.TestCase):
{
"datetime": datetime(2013, 3, 11),
# this illustrates problematic, but still functional parsing of the copyright
"copyright": 'Martin RietzeAlien Landscapes on Planet Earth',
"copyright": 'Martin Rietze (Alien Landscapes on Planet Earth)',
"date": "2013-03-11",
"explanation": "Why does a volcanic eruption sometimes create lightning? Pictured above, the Sakurajima volcano in southern Japan was caught erupting in early January. Magma bubbles so hot they glow shoot away as liquid rock bursts through the Earth's surface from below. The above image is particularly notable, however, for the lightning bolts caught near the volcano's summit. Why lightning occurs even in common thunderstorms remains a topic of research, and the cause of volcanic lightning is even less clear. Surely, lightning bolts help quench areas of opposite but separated electric charges. One hypothesis holds that catapulting magma bubbles or volcanic ash are themselves electrically charged, and by their motion create these separated areas. Other volcanic lightning episodes may be facilitated by charge-inducing collisions in volcanic dust. Lightning is usually occurring somewhere on Earth, typically over 40 times each second.",
"hdurl": "https://apod.nasa.gov/apod/image/1303/volcano_reitze_1280.jpg",
Expand Down