Fix non-character handling in string.utf_codepoint

Treats `U+FFFE` and `U+FFFF` as valid Unicode codepoints rather than errors. See #778.
gleam-lang · Jan 3, 2025 · 6f44f83 · 6f44f83
1 parent c5d0ede
commit 6f44f83
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,8 @@
 - The deprecated `function.compose`, `function.constant`, `function.apply*`,
   `function.curry*`, `result.nil_error`, `list.concat`, `bool.compare`, and
   `bool.to_int` functions have been removed.
+- Fixed a bug where `string.utf_codepoint` would treat valid Unicode codepoints
+  `U+FFFE` and `U+FFFF` as invalid.
 
 ## v0.51.0 - 2024-12-22
 

diff --git a/src/gleam/string.gleam b/src/gleam/string.gleam
@@ -809,7 +809,6 @@ pub fn from_utf_codepoints(utf_codepoints: List(UtfCodepoint)) -> String
 pub fn utf_codepoint(value: Int) -> Result(UtfCodepoint, Nil) {
   case value {
     i if i > 1_114_111 -> Error(Nil)
-    65_534 | 65_535 -> Error(Nil)
     i if i >= 55_296 && i <= 57_343 -> Error(Nil)
     i if i < 0 -> Error(Nil)
     i -> Ok(unsafe_int_to_utf_codepoint(i))

diff --git a/test/gleam/string_test.gleam b/test/gleam/string_test.gleam
@@ -702,17 +702,43 @@ pub fn from_utf_codepoints_test() {
 }
 
 pub fn utf_codepoint_test() {
-  string.utf_codepoint(1_114_444)
+  // Less than the lower bound on valid codepoints
+  string.utf_codepoint(-1)
   |> should.be_error
 
-  string.utf_codepoint(65_534)
+  // The lower bound on valid codepoints
+  string.utf_codepoint(0)
+  |> should.be_ok
+
+  // The upper bound on valid codepoints (U+10FFFF)
+  string.utf_codepoint(1_114_111)
+  |> should.be_ok
+
+  // Greater than the upper bound on valid codepoints
+  string.utf_codepoint(1_114_112)
   |> should.be_error
 
+  // Non-characters U+FFFE and U+FFFF are valid codepoints. See #778.
+  string.utf_codepoint(65_534)
+  |> should.be_ok
+  string.utf_codepoint(65_535)
+  |> should.be_ok
+
+  // One less than the lowest high-surrogate code point (U+D7FF)
+  string.utf_codepoint(55_295)
+  |> should.be_ok
+
+  // Lowest value of the "High-surrogate code point" (U+D800 to U+DBFF)
   string.utf_codepoint(55_296)
   |> should.be_error
 
-  string.utf_codepoint(-1)
+  // Highest value of the "Low-surrogate code point" (U+DC00 to U+DFFF)
+  string.utf_codepoint(57_343)
   |> should.be_error
+
+  // One greater than the highest low-surrogate code point (U+E000)
+  string.utf_codepoint(57_344)
+  |> should.be_ok
 }
 
 pub fn bit_array_utf_codepoint_test() {