Skip to content

Commit

Permalink
Allow multiple values in manifest TSV
Browse files Browse the repository at this point in the history
  • Loading branch information
BryanFauble committed Dec 20, 2023
1 parent 41a3919 commit 91881be
Show file tree
Hide file tree
Showing 6 changed files with 338 additions and 32 deletions.
24 changes: 17 additions & 7 deletions docs/explanations/manifest_tsv.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ See:

Any columns that are not in the reserved names described above will be interpreted as annotations of the file

For example this is adding 2 annotations to each row:
Annotations can be semi-colon (";") separated lists. If you leave a space, like "aaaa; bbbb" the white space from " bbbb" will be stripped.

| path | parent | annot1 | annot2 |
| --- | --- | --- | --- |
| /path/file1.txt | syn1243 | "bar" | 3.1415 |
| /path/file2.txt | syn12433 | "baz" | 2.71 |
| /path/file3.txt | syn12455 | "zzz" | 3.52 |
For example this is adding 4 annotations to each row:

| path | parent | annot1 | annot2 | annot3 | annot4 |
| --- | --- | --- | --- | --- | --- |
| /path/file1.txt | syn1243 | "bar" | 3.1415 | "aaaa; bbbb" | 14;27;30 |
| /path/file2.txt | syn12433 | "baz" | 2.71 | "value_1;value_2" | 1;2;3 |
| /path/file3.txt | syn12455 | "zzz" | 3.52 | "value_3;value_4" | 42; 56; 77 |

See:

Expand All @@ -73,6 +75,14 @@ See:
| /path/file2.txt | syn12433 | "baz" | 2.71 | 2001-01-01 15:00:00+07:00 | "" | "https://github.org/foo/baz" |
| /path/file3.txt | syn12455 | "zzz" | 3.52 | 2023-12-04T07:00:00Z | "" | "https://github.org/foo/zzz" |

## See:
### Dates in the manifest file
Dates within the manifest file will always be written as [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format in UTC without milliseconds. For example: `2023-12-20T16:55:08Z`.

Dates can be written in any other format specified in ISO 8601 and they will be recognized; however, [synapseutils.syncFromSynapse][] will always write them in the UTC format specified above. For example, you may want to specify a datetime in a specific timezone like `2023-12-20 23:55:08-07:00`, and this will be recognized as a valid datetime.


## References:

- [synapseutils.syncFromSynapse][]
- [synapseutils.syncToSynapse][]
- [Managing custom metadata at scale](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html#ManagingCustomMetadataatScale-BatchUploadFileswithAnnotations)
12 changes: 11 additions & 1 deletion synapseclient/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
entity.lat_long = [47.627477, -122.332154]
```
Record when we collected the data. This will use the current timezone of the machine running the code.
Record when we collected the data. **This will use the current timezone of the machine
running the code.**
```python
from datetime import datetime as Datetime
Expand All @@ -35,6 +36,15 @@
entity.collection_date = Datetime.utcnow()
```
You may also use a timezone-aware datetime object like the following example. Using the
[pytz library](https://pypi.org/project/pytz/) is recommended for this purpose:
```python
from datetime import datetime as Datetime, timezone as Timezone, timedelta as Timedelta
date = Datetime(2023, 12, 20, 8, 10, 0, tzinfo=Timezone(Timedelta(hours=-5)))
```
See:
- [synapseclient.Synapse.get_annotations][]
Expand Down
67 changes: 63 additions & 4 deletions synapseclient/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,56 @@ def is_synapse_id_str(obj):
return None


def bool_or_none(input_value: str) -> typing.Union[bool, None]:
    """
    Interpret the literal spellings of a boolean, yielding None for anything else.

    Only the exact strings "True"/"true" and "False"/"false" are recognised.

    Args:
        input_value: The string to interpret as a bool

    Returns:
        True or False for a recognised spelling, otherwise None
    """
    if input_value in ("True", "true"):
        return True
    if input_value in ("False", "false"):
        return False
    # Anything that is not one of the four accepted spellings is "not a bool".
    return None


def int_or_none(input_value: str) -> typing.Union[int, None]:
    """
    Attempts to convert a string to an int. Returns None if it fails.

    Args:
        input_value: The string to convert to an int

    Returns:
        The int or None if the conversion fails
    """
    try:
        return int(input_value)
    except (ValueError, TypeError):
        # ValueError: the text is not an integer literal (e.g. "abc", "1.5").
        # TypeError: the value is not something int() accepts at all (e.g. None),
        # which must also honour the "None if it fails" contract.
        return None


def float_or_none(input_value: str) -> typing.Union[float, None]:
    """
    Attempts to convert a string to a float. Returns None if it fails.

    Args:
        input_value: The string to convert to a float

    Returns:
        The float or None if the conversion fails
    """
    try:
        return float(input_value)
    except (ValueError, TypeError):
        # ValueError: the text is not a float literal (e.g. "abc", "").
        # TypeError: the value is not something float() accepts at all (e.g. None),
        # which must also honour the "None if it fails" contract.
        return None


def datetime_or_none(datetime_str: str) -> typing.Union[datetime.datetime, None]:
"""Attempts to convert a string to a datetime object. Returns None if it fails.
Expand Down Expand Up @@ -501,20 +551,29 @@ def from_unix_epoch_time(ms) -> datetime.datetime:
return from_unix_epoch_time_secs(ms / 1000.0)


def datetime_to_iso(dt, sep="T", include_milliseconds=True) -> str:
    """
    Format a datetime as an ISO 8601 string with a trailing "Z".

    Microseconds are rounded to milliseconds (as expected by older clients) and
    the "Z" is added back at the end.
    see: http://stackoverflow.com/questions/30266188/how-to-convert-date-string-to-iso8601-standard

    Args:
        dt: The datetime to format. A timezone-aware value is converted to UTC
            first so that the "Z" suffix is truthful; a naive value is formatted
            as-is.
        sep: The separator placed between the date and the time portions.
        include_milliseconds: When True the output carries a 3 digit millisecond
            field, otherwise the seconds are the last field before the "Z".

    Returns:
        The formatted timestamp, e.g. `2023-12-20T16:55:08.123Z` or
        `2023-12-20T16:55:08Z`.
    """
    # The output is always stamped "Z", so an aware datetime in another zone has
    # to be shifted to UTC or the written wall-clock time would be wrong.
    if dt.tzinfo is not None and dt.utcoffset() is not None:
        dt = dt.astimezone(datetime.timezone.utc)

    # 999.5ms and above would round to "1000" milliseconds; carry into seconds.
    if dt.microsecond >= 999500:
        dt -= datetime.timedelta(microseconds=dt.microsecond)
        dt += datetime.timedelta(seconds=1)

    timestamp = (
        "{time.year:04}-{time.month:02}-{time.day:02}"
        "{sep}{time.hour:02}:{time.minute:02}:{time.second:02}"
    ).format(time=dt, sep=sep)

    if include_milliseconds:
        timestamp += ".{millisecond:03}".format(
            millisecond=int(round(dt.microsecond / 1000.0))
        )
    return timestamp + "Z"


def iso_to_datetime(iso_time):
Expand Down
174 changes: 162 additions & 12 deletions synapseutils/sync.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import csv
import concurrent.futures
from contextlib import contextmanager
import datetime
import io
import os
import re
Expand All @@ -11,8 +12,16 @@
from .monitor import notifyMe
from synapseclient.entity import is_container
from synapseclient.core import config
from synapseclient.core.utils import id_of, is_url, is_synapse_id_str, datetime_or_none
from synapseclient import File, table
from synapseclient.core.utils import (
id_of,
is_url,
is_synapse_id_str,
datetime_or_none,
float_or_none,
int_or_none,
bool_or_none,
)
from synapseclient import File, table, Synapse
from synapseclient.core.pool_provider import SingleThreadExecutor
from synapseclient.core import utils
from synapseclient.core.cumulative_transfer_progress import CumulativeTransferProgress
Expand Down Expand Up @@ -82,6 +91,25 @@ def syncFromSynapse(
"""Synchronizes all the files in a folder (including subfolders) from Synapse and adds a readme manifest with file
metadata.
There are a few conversions around annotations to call out here.
## Conversion of objects from the REST API to Python native objects
The first annotation conversion is to take the annotations from the REST API and
convert them into Python native objects. For example the REST API will return a
milliseconds since epoch timestamp for a datetime annotation, however, we want to
convert that into a Python datetime object. These conversions take place in the
[annotations module][synapseclient.annotations].
## Conversion of Python native objects into strings
The second annotation conversion occurs when we are writing to the manifest TSV file.
In this case we need to convert the Python native objects into strings that can be
written to the manifest file. In addition we also need to handle the case where the
annotation value is a list of objects. In this case we are converting the list
into a single cell of data with a semicolon ";" delimiter.
Arguments:
syn: A Synapse object with user's login, e.g. syn = synapseclient.login()
entity: A Synapse ID, a Synapse Entity object of type file, folder or project.
Expand Down Expand Up @@ -752,7 +780,7 @@ def _extract_file_entity_metadata(syn, allFiles, *, provenance_cache=None):
}
row.update(
{
key: (val[0] if len(val) > 0 else "")
key: (val if len(val) > 0 else "")
for key, val in entity.annotations.items()
}
)
Expand Down Expand Up @@ -796,14 +824,78 @@ def _get_file_entity_provenance_dict(syn, entity):
raise # unexpected error so we re-raise the exception


def _write_manifest_data(filename, keys, data):
def _convert_manifest_data_items_to_string_list(
items: typing.List[str],
) -> typing.List[str]:
"""
Handle coverting an individual key that contains a list of data into a list of strings
that can be written to the manifest file. This has specific logic around how to
handle datetime fields and non strings.
When working with datetime fields we are printing the ISO 8601 UTC representation of
the datetime.
When working with non strings we are printing the repr of the object.
Args:
items: The list of items to convert.
Returns:
The list of items converted to strings.
"""
items_to_write = []
for item in items:
if isinstance(item, datetime.datetime):
items_to_write.append(
utils.datetime_to_iso(dt=item, include_milliseconds=False)
)
else:
items_to_write.append(item if isinstance(item, str) else repr(item))
return items_to_write


def _convert_manifest_data_row_to_dict(row: dict, keys: typing.List[str]) -> str:
"""
Convert a row of data to a dict that can be written to a manifest file.
Args:
row: The row of data to convert.
keys: The keys of the manifest. Used to select the rows of data.
Returns:
The dict representation of the row.
"""
data_to_write = {}
for key in keys:
data_for_key = row.get(key, "")
if isinstance(data_for_key, list):
items_to_write = _convert_manifest_data_items_to_string_list(data_for_key)
data_to_write[key] = ";".join(items_to_write)
else:
data_to_write[key] = data_for_key
return data_to_write


def _write_manifest_data(
    filename: str, keys: typing.List[str], data: typing.List[dict]
) -> None:
    """
    Write a number of keys and a list of data to a manifest file. This will write
    the data out as a tab separated file.

    Args:
        filename: The name of the file to write to. When this is empty/None the
            manifest is written to standard output instead.
        keys: The keys of the manifest.
        data: The data to write to the manifest. This should be a list of dicts where
            each dict represents a row of data.
    """

    def _write_rows(fp) -> None:
        # Emit the header followed by one converted row per entry of `data`.
        csv_writer = csv.DictWriter(
            fp, keys, restval="", extrasaction="ignore", delimiter="\t"
        )
        csv_writer.writeheader()
        for row in data:
            csv_writer.writerow(rowdict=_convert_manifest_data_row_to_dict(row, keys))

    if filename:
        # newline="" lets the csv module control line endings itself; without it
        # platforms that translate "\n" end up with doubled carriage returns.
        with io.open(filename, "w", encoding="utf8", newline="") as fp:
            _write_rows(fp)
    else:
        # sys.stdout must NOT be used as a context manager: leaving the `with`
        # block would close it and break every later print in the process.
        _write_rows(sys.stdout)


def _sortAndFixProvenance(syn, df):
Expand Down Expand Up @@ -985,6 +1077,23 @@ def syncToSynapse(
[Read more about the manifest file format](../../explanations/manifest_tsv/)
There are a few conversions around annotations to call out here.
## Conversion of annotations from the TSV file to Python native objects
The first annotation conversion is from the TSV file into a Python native object. For
example Pandas will read a TSV file and convert the string "True" into a boolean True,
however, Pandas will NOT convert our semi-colon delimited list of annotations into
their Python native objects. This means that we need to do that conversion here after
splitting them apart.
## Conversion of Python native objects for the REST API
The second annotation conversion occurs when we are taking the Python native objects
and converting them into a string that can be sent to the REST API. For example
the datetime objects which may have timezone information are converted to milliseconds
since epoch.
Arguments:
syn: A Synapse object with user's login, e.g. syn = synapseclient.login()
manifestFile: A tsv file with file locations and metadata to be pushed to Synapse.
Expand Down Expand Up @@ -1021,9 +1130,50 @@ def syncToSynapse(
_manifest_upload(syn, df)


def _manifest_upload(syn, df):
def _convert_cell_in_manifest_to_python_types(
    cell: str,
) -> typing.Union[typing.List, datetime.datetime, float, int, bool, str]:
    """
    Takes a possibly semi-colon delimited cell from the manifest TSV file into a list
    of items to be used as annotations.

    Each ";" separated value has its surrounding whitespace stripped (so
    "aaaa; bbbb" yields "aaaa" and "bbbb") and is then converted to the first
    type that accepts it, tried in this order: datetime, int, float, bool. A
    value that matches none of them is kept as a string.

    Args:
        cell: The cell item to convert.

    Returns:
        The list of items to be used as annotations. Or a single instance if that is
        all that is present.
    """
    values_to_return = []

    for raw_value in cell.split(";"):
        # The manifest format allows a space after the delimiter; it is not
        # part of the annotation value.
        annotation_value = raw_value.strip()
        if (possible_datetime := datetime_or_none(annotation_value)) is not None:
            values_to_return.append(possible_datetime)
        elif (possible_int := int_or_none(annotation_value)) is not None:
            values_to_return.append(possible_int)
        elif (possible_float := float_or_none(annotation_value)) is not None:
            values_to_return.append(possible_float)
        elif (possible_bool := bool_or_none(annotation_value)) is not None:
            values_to_return.append(possible_bool)
        else:
            values_to_return.append(annotation_value)
    return values_to_return[0] if len(values_to_return) == 1 else values_to_return


def _manifest_upload(syn: Synapse, df) -> bool:
"""
Handles the upload of the manifest file.
Args:
syn: The logged in Synapse client.
df: The dataframe of the manifest file.
Returns:
If the manifest upload was successful.
"""
items = []
for i, row in df.iterrows():
for _, row in df.iterrows():
file = File(
path=row["path"],
parent=row["parent"],
Expand All @@ -1047,12 +1197,12 @@ def _manifest_upload(syn, df):
for annotation_key, annotation_value in annotations.items():
if annotation_value is None or annotation_value == "":
continue
possible_datetime = None
if isinstance(annotation_value, str):
possible_datetime = datetime_or_none(annotation_value)
file_annotations[annotation_key] = (
annotation_value if possible_datetime is None else possible_datetime
)
file_annotations[
annotation_key
] = _convert_cell_in_manifest_to_python_types(cell=annotation_value)
else:
file_annotations[annotation_key] = annotation_value
file.annotations = file_annotations

item = _SyncUploadItem(
Expand Down
Loading

0 comments on commit 91881be

Please sign in to comment.