-
Notifications
You must be signed in to change notification settings - Fork 73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor: Implement msgspec encoding #2541
base: main
Are you sure you want to change the base?
Changes from all commits
913bc83
e09bd59
cbe10bd
8d32686
e04f827
8aefade
f691f78
6caf27d
08b58bf
3169b58
9e46c44
66f0d4e
2f91bc1
07f348e
8c748a1
815f056
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
from __future__ import annotations | ||
|
||
import datetime | ||
import decimal | ||
import logging | ||
import sys | ||
import typing as t | ||
|
||
import msgspec | ||
|
||
from singer_sdk._singerlib.exceptions import InvalidInputLine | ||
|
||
from ._base import GenericSingerReader, GenericSingerWriter | ||
from ._simple import Message | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def enc_hook(obj: t.Any) -> t.Any: # noqa: ANN401 | ||
"""Encoding type helper for non native types. | ||
|
||
Args: | ||
obj: the item to be encoded | ||
|
||
Returns: | ||
The object converted to the appropriate type, default is str | ||
""" | ||
return obj.isoformat(sep="T") if isinstance(obj, datetime.datetime) else str(obj) | ||
|
||
|
||
def dec_hook(type: type, obj: t.Any) -> t.Any: # noqa: ARG001, A002, ANN401 | ||
"""Decoding type helper for non native types. | ||
|
||
Args: | ||
type: the type given | ||
obj: the item to be decoded | ||
|
||
Returns: | ||
The object converted to the appropriate type, default is str. | ||
""" | ||
return str(obj) | ||
|
||
|
||
encoder = msgspec.json.Encoder(enc_hook=enc_hook, decimal_format="number") | ||
decoder = msgspec.json.Decoder(dec_hook=dec_hook, float_hook=decimal.Decimal) | ||
_jsonl_msg_buffer = bytearray(64) | ||
|
||
|
||
def serialize_jsonl(obj: object, **kwargs: t.Any) -> bytes: # noqa: ARG001 | ||
"""Serialize a dictionary into a line of jsonl. | ||
|
||
Args: | ||
obj: A Python object usually a dict. | ||
**kwargs: Optional key word arguments. | ||
|
||
Returns: | ||
A bytes of serialized json. | ||
""" | ||
encoder.encode_into(obj, _jsonl_msg_buffer) | ||
_jsonl_msg_buffer.extend(b"\n") | ||
return _jsonl_msg_buffer | ||
|
||
|
||
class MsgSpecReader(GenericSingerReader[str]): | ||
"""Base class for all plugins reading Singer messages as strings from stdin.""" | ||
|
||
default_input = sys.stdin | ||
|
||
def deserialize_json(self, line: str) -> dict: # noqa: PLR6301 | ||
"""Deserialize a line of json. | ||
|
||
Args: | ||
line: A single line of json. | ||
|
||
Returns: | ||
A dictionary of the deserialized json. | ||
|
||
Raises: | ||
InvalidInputLine: If the line cannot be parsed | ||
""" | ||
try: | ||
return decoder.decode(line) # type: ignore[no-any-return] | ||
except msgspec.DecodeError as exc: | ||
logger.exception("Unable to parse:\n%s", line) | ||
msg = f"Unable to parse line as JSON: {line}" | ||
raise InvalidInputLine(msg) from exc | ||
|
||
|
||
class MsgSpecWriter(GenericSingerWriter[bytes, Message]): | ||
"""Interface for all plugins writing Singer messages to stdout.""" | ||
|
||
def serialize_message(self, message: Message) -> bytes: # noqa: PLR6301 | ||
"""Serialize a dictionary into a line of json. | ||
|
||
Args: | ||
message: A Singer message object. | ||
|
||
Returns: | ||
A string of serialized json. | ||
""" | ||
return serialize_jsonl(message.to_dict()) | ||
|
||
def write_message(self, message: Message) -> None: | ||
"""Write a message to stdout. | ||
|
||
Args: | ||
message: The message to write. | ||
""" | ||
sys.stdout.buffer.write(self.format_message(message)) | ||
sys.stdout.flush() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -111,7 +111,12 @@ def get_standard_target_tests( | |
return [] | ||
|
||
|
||
def tap_sync_test(tap: Tap) -> tuple[io.StringIO, io.StringIO]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue (complexity): Consider creating a type alias to simplify the function signatures. The type annotations have become overly verbose while not adding proportional value. Create a type alias to simplify the signatures while maintaining type safety: # At the top of the file
SingerIO = io.TextIOWrapper[io.BytesIO]
# Then simplify function signatures like:
def tap_sync_test(
tap: Tap,
) -> tuple[SingerIO, SingerIO]:
stdout_buf = SingerIO(io.BytesIO(), encoding="utf-8")
...
def tap_to_target_sync_test(
tap: Tap,
target: Target,
) -> tuple[SingerIO, SingerIO, SingerIO, SingerIO]:
... This maintains the same type safety while improving readability and making future type changes easier to maintain. |
||
def tap_sync_test( | ||
tap: Tap, | ||
) -> tuple[ | ||
io.TextIOWrapper[io.BytesIO], | ||
io.TextIOWrapper[io.BytesIO], | ||
]: | ||
"""Invokes a Tap object and return STDOUT and STDERR results in StringIO buffers. | ||
|
||
Args: | ||
|
@@ -120,8 +125,8 @@ def tap_sync_test(tap: Tap) -> tuple[io.StringIO, io.StringIO]: | |
Returns: | ||
A 2-item tuple with StringIO buffers from the Tap's output: (stdout, stderr) | ||
""" | ||
stdout_buf = io.StringIO() | ||
stderr_buf = io.StringIO() | ||
stdout_buf = io.TextIOWrapper(io.BytesIO(), encoding="utf-8") | ||
stderr_buf = io.TextIOWrapper(io.BytesIO(), encoding="utf-8") | ||
with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): | ||
tap.sync_all() | ||
stdout_buf.seek(0) | ||
|
@@ -171,10 +176,10 @@ def _select_all(catalog_dict: dict) -> dict: | |
|
||
def target_sync_test( | ||
target: Target, | ||
input: io.StringIO | None, # noqa: A002 | ||
input: t.IO[str] | None, # noqa: A002 | ||
*, | ||
finalize: bool = True, | ||
) -> tuple[io.StringIO, io.StringIO]: | ||
) -> tuple[io.TextIOWrapper[io.BytesIO], io.TextIOWrapper[io.BytesIO]]: | ||
"""Invoke the target with the provided input. | ||
|
||
Args: | ||
|
@@ -186,8 +191,8 @@ def target_sync_test( | |
Returns: | ||
A 2-item tuple with StringIO buffers from the Target's output: (stdout, stderr) | ||
""" | ||
stdout_buf = io.StringIO() | ||
stderr_buf = io.StringIO() | ||
stdout_buf = io.TextIOWrapper(io.BytesIO(), encoding="utf-8") | ||
stderr_buf = io.TextIOWrapper(io.BytesIO(), encoding="utf-8") | ||
|
||
with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): | ||
if input is not None: | ||
|
@@ -203,7 +208,12 @@ def target_sync_test( | |
def tap_to_target_sync_test( | ||
tap: Tap, | ||
target: Target, | ||
) -> tuple[io.StringIO, io.StringIO, io.StringIO, io.StringIO]: | ||
) -> tuple[ | ||
io.TextIOWrapper[io.BytesIO], | ||
io.TextIOWrapper[io.BytesIO], | ||
io.TextIOWrapper[io.BytesIO], | ||
io.TextIOWrapper[io.BytesIO], | ||
]: | ||
"""Test and end-to-end sink from the tap to the target. | ||
|
||
Note: This method buffers all output from the tap in memory and should not be | ||
|
@@ -236,15 +246,15 @@ def sync_end_to_end(tap: Tap, target: Target, *mappers: InlineMapper) -> None: | |
mappers: Zero or more inline mapper to apply in between the tap and target, in | ||
order. | ||
""" | ||
buf = io.StringIO() | ||
buf = io.TextIOWrapper(io.BytesIO(), encoding="utf-8") | ||
with redirect_stdout(buf): | ||
tap.sync_all() | ||
|
||
buf.seek(0) | ||
mapper_output = buf | ||
|
||
for mapper in mappers: | ||
buf = io.StringIO() | ||
buf = io.TextIOWrapper(io.BytesIO(), encoding="utf-8") | ||
with redirect_stdout(buf): | ||
mapper.listen(mapper_output) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from __future__ import annotations |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🚨 issue (security): The global bytearray buffer could cause thread-safety issues and the fixed size might be insufficient for larger messages
Consider using a thread-local buffer or creating a new buffer for each message. Also, the buffer size should either be dynamic or much larger to handle varying message sizes safely.