Skip to content

Commit

Permalink
Fix non-character handling in string.utf_codepoint
Browse files Browse the repository at this point in the history
Treats the non-characters `U+FFFE` and `U+FFFF` as valid Unicode codepoints instead of returning an error. See #778.
  • Loading branch information
mooreryan authored and lpil committed Jan 3, 2025
1 parent c5d0ede commit 6f44f83
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
- The deprecated `function.compose`, `function.constant`, `function.apply*`,
`function.curry*`, `result.nil_error`, `list.concat`, `bool.compare`, and
`bool.to_int` functions have been removed.
- Fixed a bug where `string.utf_codepoint` would treat valid Unicode codepoints
`U+FFFE` and `U+FFFF` as invalid.

## v0.51.0 - 2024-12-22

Expand Down
1 change: 0 additions & 1 deletion src/gleam/string.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,6 @@ pub fn from_utf_codepoints(utf_codepoints: List(UtfCodepoint)) -> String
pub fn utf_codepoint(value: Int) -> Result(UtfCodepoint, Nil) {
case value {
i if i > 1_114_111 -> Error(Nil)
65_534 | 65_535 -> Error(Nil)
i if i >= 55_296 && i <= 57_343 -> Error(Nil)
i if i < 0 -> Error(Nil)
i -> Ok(unsafe_int_to_utf_codepoint(i))
Expand Down
32 changes: 29 additions & 3 deletions test/gleam/string_test.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -702,17 +702,43 @@ pub fn from_utf_codepoints_test() {
}

/// Checks `string.utf_codepoint` at the boundaries of the integer ranges it
/// accepts and rejects: below zero, zero, the maximum codepoint, one past the
/// maximum, the non-characters U+FFFE/U+FFFF, and both edges of the surrogate
/// range.
pub fn utf_codepoint_test() {
// NOTE(review): the result of this call is not piped into any assertion, so
// it verifies nothing. It looks like a pre-change line left over from the
// diff rendering - confirm and remove.
string.utf_codepoint(1_114_444)
// Less than the lower bound on valid codepoints
string.utf_codepoint(-1)
|> should.be_error

// NOTE(review): bare call with no assertion; presumably another leftover
// pre-change line - confirm.
string.utf_codepoint(65_534)
// The lower bound on valid codepoints
string.utf_codepoint(0)
|> should.be_ok

// The upper bound for valid codepoints (1_114_111 is U+10FFFF)
string.utf_codepoint(1_114_111)
|> should.be_ok

// Greater than the upper bound on valid codepoints
string.utf_codepoint(1_114_112)
|> should.be_error

// Non-characters U+FFFE and U+FFFF are valid codepoints. See (#778).
// 65_534 is U+FFFE and 65_535 is U+FFFF.
string.utf_codepoint(65_534)
|> should.be_ok
string.utf_codepoint(65_535)
|> should.be_ok

// One less than the lowest "High-surrogate code point" (55_295 is U+D7FF)
string.utf_codepoint(55_295)
|> should.be_ok

// Lowest value of the "High-surrogate code point" (U+D800 to U+DBFF)
string.utf_codepoint(55_296)
|> should.be_error

// NOTE(review): bare call with no assertion; presumably another leftover
// pre-change line - confirm.
string.utf_codepoint(-1)
// Highest value of the "Low-surrogate code point" (U+DC00 to U+DFFF)
string.utf_codepoint(57_343)
|> should.be_error

// One greater than the highest "Low-surrogate code point" (57_344 is U+E000)
string.utf_codepoint(57_344)
|> should.be_ok
}

pub fn bit_array_utf_codepoint_test() {
Expand Down

0 comments on commit 6f44f83

Please sign in to comment.