Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
angelogladding authored Nov 30, 2023
2 parents a9cb118 + cf322c4 commit 1786227
Show file tree
Hide file tree
Showing 12 changed files with 283 additions and 51 deletions.
19 changes: 12 additions & 7 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Change Log
All notable changes to this project will be documented in this file.

## 2.0 - unreleased
- make relative URLs in e-* properties absolute (#201)
- fix whitespace in plaintext conversion (#207)
- add srcset support (#209)

## 1.1.3 - 2023-06-28
- reduce instances where photo is implied (#135)
- always do relative URL resolution (#138)
Expand All @@ -19,7 +24,7 @@ All notable changes to this project will be documented in this file.
## 1.1.1 - 2018-06-15

- streamline backcompat to use JSON only.
- fix multiple mf1 root rel-tag parsing
- fix multiple mf1 root rel-tag parsing
- correct url and photo for hreview.
- add rules for nested hreview. update backcompat to use multiple matches in old properties.
- fix `rel-tag` to `p-category` conversion so that other classes are not lost.
Expand All @@ -34,11 +39,11 @@ All notable changes to this project will be documented in this file.
- better whitespace algorithm for `name` and `html.value` parsing
- experimental flag for including `alt` in `u-photo` parsing
- make a copy of the BeautifulSoup document given by the user and parse the copy, so the original doc is not modified
- bump version to 1.1.1
- bump version to 1.1.1

## 1.1.0 - 2018-03-16

- bump version to 1.1.0 since it is a "major" change
- bump version to 1.1.0 since it is a "major" change
- added tests for new implied name rules
- modified earlier tests to accommodate new rules
- use space separator instead of "T"
Expand All @@ -56,12 +61,12 @@ All notable changes to this project will be documented in this file.
## 1.0.6 - 2018-03-04

- strip leading/trailing white space for `e-*[html]`. update the corresponding tests
- blank values explicitly authored are allowed as property values
- blank values explicitly authored are allowed as property values
- include `alt` or `src` from `<img>` in parsing for `p-*` and `e-*[value]`
- parse `title` from `<link>` for `p-*` resolves #84
- and `poster` from `<video>` for `u-*` resolves #76
- parse `title` from `<link>` for `p-*` resolves #84
- and `poster` from `<video>` for `u-*` resolves #76
- use `html5lib` as default parser
- use the final redirect URL resolves #62
- use the final redirect URL resolves #62
- update requirements to use BS4 v4.6.0 and html5lib v1.0.1
- drop support for Python 2.6 as html5lib dropped support

Expand Down
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ PRs must pass all tests and linting requirements before they can be merged.
Before you submit a PR to `mf2py`, run the following command in the base directory of the project:

```bash
make style
make lint
```

This will format your code using the linters configured with the project.
Expand Down
48 changes: 32 additions & 16 deletions mf2py/dom_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,38 @@ def get_attr(el, attr, check_name=None):
return el.get(attr)


def get_img_src_alt(img, base_url=""):
"""given a img element, returns both src and alt attributes as a list of tuples if alt exists, else returns the src as a string
use for alt parsing with img
"""

alt = get_attr(img, "alt", check_name="img")
def parse_srcset(srcset, base_url):
    """Collect the image candidates listed in a ``srcset`` attribute value.

    The string is scanned for runs of non-whitespace characters (the candidate
    URL), each optionally followed by a width or pixel-density descriptor such
    as ``480w`` or ``1.5x``.

    Returns a dict keyed by descriptor. A candidate without a descriptor is
    stored under ``"1x"``. When a descriptor occurs more than once, only the
    first candidate seen for it is kept. Each URL has commas stripped from
    both ends and is then passed through ``try_urljoin`` together with
    ``base_url``.
    """
    candidates = re.findall(
        r"(\S+)\s*([\d.]+[xw])?\s*,?\s*",
        srcset,
        re.MULTILINE,
    )
    sources = {}
    for raw_url, found_descriptor in candidates:
        # An empty capture means no descriptor was given for this candidate.
        key = found_descriptor or "1x"
        if key in sources:
            # First candidate for a descriptor wins; later ones are ignored.
            continue
        sources[key] = try_urljoin(base_url, raw_url.strip(","))
    return sources


def get_img(img, base_url):
"""Return a dictionary with src and alt/srcset if present, else just string src."""
src = get_attr(img, "src", check_name="img")

if src is not None:
src = try_urljoin(base_url, src)

if alt is None:
return src
else:
return {"value": src, "alt": alt}
if src is None:
return
src = try_urljoin(base_url, src)
alt = get_attr(img, "alt", check_name="img")
srcset = get_attr(img, "srcset", check_name="img")
if alt is not None or srcset:
prop_value = {"value": src}
if alt is not None:
prop_value["alt"] = alt
if srcset:
prop_value["srcset"] = parse_srcset(srcset, base_url)
return prop_value
else:
return src


def get_children(node):
Expand Down Expand Up @@ -119,8 +136,7 @@ def text_collection(el, replace_img=False, img_to_src=True, base_url=""):
items.extend(child_items)

if el.name == "p":
items = [P_BREAK_BEFORE] + items
items.append(P_BREAK_AFTER)
items = [P_BREAK_BEFORE] + items + [P_BREAK_AFTER, "\n"]

return items

Expand Down
12 changes: 3 additions & 9 deletions mf2py/implied_properties.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
from . import mf2_classes
from .dom_helpers import (
get_attr,
get_children,
get_img_src_alt,
get_textContent,
try_urljoin,
)
from .dom_helpers import get_attr, get_children, get_img, get_textContent, try_urljoin


def name(el, base_url=""):
Expand Down Expand Up @@ -111,7 +105,7 @@ def resolve_relative_url(prop_value):
return prop_value

# if element is an img use source if exists
if prop_value := get_img_src_alt(el, base_url):
if prop_value := get_img(el, base_url):
return resolve_relative_url(prop_value)

# if element is an object use data if exists
Expand All @@ -136,7 +130,7 @@ def resolve_relative_url(prop_value):
# if a possible child was found parse
if poss_child is not None:
# img get src
if prop_value := get_img_src_alt(poss_child, base_url):
if prop_value := get_img(poss_child, base_url):
return resolve_relative_url(prop_value)

# object get data
Expand Down
8 changes: 6 additions & 2 deletions mf2py/parse_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from . import value_class_pattern
from .datetime_helpers import DATETIME_RE, TIME_RE, normalize_datetime
from .dom_helpers import get_attr, get_img_src_alt, get_textContent, try_urljoin
from .dom_helpers import get_attr, get_img, get_textContent, try_urljoin


def text(el, base_url=""):
Expand All @@ -31,7 +31,7 @@ def url(el, base_url=""):

prop_value = get_attr(el, "href", check_name=("a", "area", "link"))
if prop_value is None:
prop_value = get_img_src_alt(el, base_url)
prop_value = get_img(el, base_url)
if prop_value is not None:
return prop_value
if prop_value is None:
Expand Down Expand Up @@ -96,6 +96,10 @@ def datetime(el, default_date=None):

def embedded(el, root_lang, document_lang, base_url=""):
"""Process e-* properties"""
for tag in el.find_all():
for attr in ("href", "src", "cite", "data", "poster"):
if attr in tag.attrs:
tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr])
prop_value = {
"html": el.decode_contents().strip(), # secret bs4 method to get innerHTML
"value": get_textContent(el, replace_img=True, base_url=base_url),
Expand Down
77 changes: 77 additions & 0 deletions test/examples/img_with_srcset.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy-480w.jpg 480w,
elva-fairy-800w.jpg 800w"
sizes="(max-width: 600px) 480px,
800px"
src="elva-fairy-800w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy-320w.jpg,
elva-fairy-480w.jpg 1.5x,
elva-fairy-640w.jpg 2x,
elva-fairy-1.5w.jpg 1.5x,
elva-fairy-2w.jpg 2x"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy,320w.jpg, elva-fairy,480w.jpg 1.5x"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy,320w.jpg ,elva-fairy,480w.jpg 1.5x"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy,320w.jpg 1x,elva-fairy,480w.jpg 1.5x"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy,320w.jpg 1x ,elva-fairy,480w.jpg 1.5x"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy,320w.jpg 1x , elva-fairy,480w.jpg 1.5x"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy,320w.jpg"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>

<div class="h-entry">
<img class="u-photo"
srcset="
elva-fairy,320w.jpg,
elva-fairy,480w.jpg
1.5x,
elva-fairy,640w.jpg 2x ,
elva-fairy,1.5w.jpg 1.5x ,
elva-fairy,2w.jpg 2x
"
src="elva-fairy-320w.jpg"
alt="Elva dressed as a fairy">
</div>
11 changes: 11 additions & 0 deletions test/examples/img_with_srcset_with_base.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<base href="https://example.com">

<div class="h-entry">
<img class="u-photo"
srcset="elva-fairy-480w.jpg 480w,
elva-fairy-800w.jpg 800w"
sizes="(max-width: 600px) 480px,
800px"
src="elva-fairy-800w.jpg"
alt="Elva dressed as a fairy">
</div>
19 changes: 19 additions & 0 deletions test/examples/plaintext_img_whitespace.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<base href="https://example.com">

<div class="h-entry">
<div class="e-content">
<img src="/photo.jpg" alt="selfie">At some tourist spot
</div>
</div>

<div class="h-entry">
<div class="e-content">
<img src="/photo.jpg" alt="">At another tourist spot
</div>
</div>

<div class="h-entry">
<div class="e-content">
<img src="/photo.jpg">At yet another tourist spot
</div>
</div>
17 changes: 17 additions & 0 deletions test/examples/plaintext_p_whitespace.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<div class="h-entry">
<div class="e-content">
<p>foo</p><img src="pic.png" alt="bar">baz
</div>
</div>

<div class="h-entry">
<div class="e-content">
<p>foo</p>bar baz
</div>
</div>

<div class="h-entry">
<div class="e-content">
foo bar<p>baz</p>
</div>
</div>
13 changes: 13 additions & 0 deletions test/examples/relative_url_in_e.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Relative URLs in e-content</title>
<base href="http://example.com/" />
</head>
<body>
<div class="h-entry">
<div class="e-content"><p><a href=/cat.html>Cat <img src=cat.jpg></a></p></div>
</div>
</body>
</html>
18 changes: 9 additions & 9 deletions test/examples/value_name_whitespace.html
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@
World</p></div>
</div>

<div class="h-entry">
<div class="e-content p-name"><p>Hello</p><p>World</p></div>
</div>

<div class="h-entry">
<div class="e-content p-name">Hello<br>
World</div>
Expand All @@ -36,11 +32,7 @@
</div>

<div class="h-entry">
<div class="e-content p-name">
<p>One</p>
<p>Two</p>
<p>Three</p>
</div>
<div class="e-content p-name"><p>Hello</p><p>World</p></div>
</div>

<div class="h-entry">
Expand All @@ -51,6 +43,14 @@
</div>
</div>

<div class="h-entry">
<div class="e-content p-name">
<p>One</p>
<p>Two</p>
<p>Three</p>
</div>
</div>

<div class="h-entry">
<div class="e-content p-name">
Hello World
Expand Down
Loading

0 comments on commit 1786227

Please sign in to comment.