Skip to content

Commit

Permalink
Add TSV to editor (#542)
Browse files Browse the repository at this point in the history
Add support for tab-separated values (TSV) files to the Croissant editor.

Tab-separated values are used by some datasets (e.g., FLORES-200). See
#541 for the related issue.
  • Loading branch information
mkuchnik authored Feb 22, 2024
1 parent ba3a49d commit eaea2bc
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 3 deletions.
24 changes: 22 additions & 2 deletions editor/core/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,12 @@ class FileTypes:
encoding_format="application/x-tar",
extensions=["tar"],
)
TSV = FileType(
name="TSV", encoding_format="text/tab-separated-values", extensions=["tsv"]
)
TXT = FileType(
name="Text",
encoding_format="plain/text",
encoding_format="text/plain",
extensions=["txt"],
)
ZIP = FileType(
Expand All @@ -79,6 +82,7 @@ def _full_name(file_type: FileType):
FileTypes.JSONL,
FileTypes.PARQUET,
FileTypes.TAR,
FileTypes.TSV,
FileTypes.TXT,
FileTypes.ZIP,
]
Expand Down Expand Up @@ -141,6 +145,8 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.Data
df = pd.read_json(file, lines=True)
elif file_type == FileTypes.PARQUET:
df = pd.read_parquet(file)
elif file_type == FileTypes.TSV:
df = pd.read_csv(file, sep="\t")
else:
raise NotImplementedError(
f"File type {file_type} is not supported. Please, open an issue on GitHub:"
Expand All @@ -149,8 +155,22 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.Data
return df.infer_objects()


def guess_file_type(path: epath.Path) -> FileType | None:
def _guess_mime_type(path: epath.Path) -> str:
    """Guess the most specific MIME type for the file at `path`.

    The MIME type is first obtained from `magic.from_file`. When that yields
    the generic "text/plain", the file extension is used to refine the result
    for CSV and TSV files.

    Args:
        path: Path of the file to inspect.

    Returns:
        The MIME type reported by `magic`, or the CSV/TSV MIME type when the
        file was reported as plain text but carries a `.csv`/`.tsv` extension
        (matched case-insensitively).
    """
    mime = magic.from_file(path, mime=True)
    if mime != "text/plain":
        return mime
    # In some cases, a CSV/TSV may be classified as plain text, for example if
    # the file is not terminated by a newline. Fall back to the extension, and
    # lowercase it so that "DATA.CSV" / "DATA.TSV" are refined as well.
    mime_by_extension = {
        ".csv": "text/csv",
        ".tsv": "text/tab-separated-values",
    }
    return mime_by_extension.get(path.suffix.lower(), mime)


def guess_file_type(path: epath.Path) -> FileType | None:
    """Look up the file type matching the guessed MIME type of `path`.

    Args:
        path: Path of the file to inspect.

    Returns:
        The value stored in `ENCODING_FORMATS` under the MIME type returned by
        `_guess_mime_type`, or whatever `ENCODING_FORMATS.get` yields when the
        MIME type has no entry (None per the return annotation).
    """
    return ENCODING_FORMATS.get(_guess_mime_type(path))


Expand Down
48 changes: 47 additions & 1 deletion editor/core/files_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@


@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
def test_check_file_csv(guess_file_type):
def test_check_file_csv_url(guess_file_type):
del guess_file_type
csv = epath.Path(
# This is the hash path for "https://my.url".
"/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
)
# Test unescaped CSV
if csv.exists():
csv.unlink()
with csv.open("w") as f:
Expand All @@ -28,6 +29,51 @@ def test_check_file_csv(guess_file_type):
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
)

# Test error thrown on no file
csv.unlink()
with pytest.raises(Exception):
files_module.file_from_url("https://my.url", set(), epath.Path())

# Test escaped CSV
content = b'"This","Is"\n1,2\n3,4'
with csv.open("wb") as f:
f.write(content)
file = files_module.file_from_url("https://my.url", set(), epath.Path())
pd.testing.assert_frame_equal(file.df, pd.DataFrame({"This": [1, 3], "Is": [2, 4]}))


@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.TSV)
def test_check_file_tsv_url(guess_file_type):
    """Exercise `file_from_url` on TSV content with file-type guessing patched."""
    # Only the patch itself matters; the mock object is never inspected.
    del guess_file_type
    local_copy = epath.Path(
        # This is the hash path for "https://my.url".
        "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
    )

    # Case 1: unescaped TSV (no quoting around the values).
    if local_copy.exists():
        local_copy.unlink()
    rows = ("column1\tcolumn2\n", "a\t1\n", "b\t2\n", "c\t3\n")
    with local_copy.open("w") as f:
        for row in rows:
            f.write(row)
    loaded = files_module.file_from_url("https://my.url", set(), epath.Path())
    expected_plain = pd.DataFrame(
        {"column1": ["a", "b", "c"], "column2": [1, 2, 3]}
    )
    pd.testing.assert_frame_equal(loaded.df, expected_plain)

    # Case 2: an error is raised once the file no longer exists.
    local_copy.unlink()
    with pytest.raises(Exception):
        files_module.file_from_url("https://my.url", set(), epath.Path())

    # Case 3: escaped TSV (quoted header cells), written as raw bytes and
    # without a trailing newline.
    content = b'"This"\t"Is"\n1\t2\n3\t4'
    with local_copy.open("wb") as f:
        f.write(content)
    loaded = files_module.file_from_url("https://my.url", set(), epath.Path())
    expected_quoted = pd.DataFrame({"This": [1, 3], "Is": [2, 4]})
    pd.testing.assert_frame_equal(loaded.df, expected_quoted)


@mock.patch.object(files_module, "guess_file_type", return_value="unknown")
def test_check_file_unknown(guess_file_type):
Expand Down

0 comments on commit eaea2bc

Please sign in to comment.