diff --git a/api/python/quilt3/packages.py b/api/python/quilt3/packages.py index 5bb913520d5..a9f5394b70d 100644 --- a/api/python/quilt3/packages.py +++ b/api/python/quilt3/packages.py @@ -34,6 +34,7 @@ legacy_calculate_checksum, legacy_calculate_checksum_bytes, list_object_versions, + list_objects, list_url, put_bytes, ) @@ -841,7 +842,7 @@ def _load(cls, readable_file): gc.enable() return pkg - def set_dir(self, lkey, path=None, meta=None, update_policy="incoming"): + def set_dir(self, lkey, path=None, meta=None, update_policy="incoming", unversioned: bool = False): """ Adds all files from `path` to the package. @@ -858,6 +859,7 @@ def set_dir(self, lkey, path=None, meta=None, update_policy="incoming"): If 'incoming', whenever logical keys match, always take the new entry from set_dir. If 'existing', whenever logical keys match, retain existing entries and ignore new entries from set_dir. + unversioned(bool): when True, do not retrieve VersionId for S3 physical keys. Returns: self @@ -910,10 +912,13 @@ def set_dir(self, lkey, path=None, meta=None, update_policy="incoming"): src_path = src.path if src.basename() != '': src_path += '/' - objects, _ = list_object_versions(src.bucket, src_path) + if not unversioned: + objects, _ = list_object_versions(src.bucket, src_path) + objects = filter(lambda obj: obj["IsLatest"], objects) + else: + objects = list_objects(src.bucket, src_path, recursive=True) + for obj in objects: - if not obj['IsLatest']: - continue # Skip S3 pseduo directory files and Keys that end in / if obj['Key'].endswith('/'): if obj['Size'] != 0: @@ -1134,7 +1139,15 @@ def manifest(self): for logical_key, entry in self.walk(): yield {'logical_key': logical_key, **entry.as_dict()} - def set(self, logical_key, entry=None, meta=None, serialization_location=None, serialization_format_opts=None): + def set( + self, + logical_key, + entry=None, + meta=None, + serialization_location=None, + serialization_format_opts=None, + unversioned: bool = False, + ): """ Returns self with the object at logical_key set to entry. @@ -1154,6 +1167,7 @@ def set(self, logical_key, entry=None, meta=None, serialization_location=None, s https://github.com/quiltdata/quilt/blob/master/api/python/quilt3/formats.py serialization_location(string): Optional. If passed in, only used if entry is an object. Where the serialized object should be written, e.g. "./mydataframe.parquet" + unversioned(bool): when True, do not retrieve VersionId for S3 physical keys. Returns: self @@ -1162,9 +1176,18 @@ def set(self, logical_key, entry=None, meta=None, serialization_location=None, s entry=entry, meta=meta, serialization_location=serialization_location, - serialization_format_opts=serialization_format_opts) + serialization_format_opts=serialization_format_opts, + unversioned=unversioned) - def _set(self, logical_key, entry=None, meta=None, serialization_location=None, serialization_format_opts=None): + def _set( + self, + logical_key, + entry=None, + meta=None, + serialization_location=None, + serialization_format_opts=None, + unversioned: bool = False, + ): if not logical_key or logical_key.endswith('/'): raise QuiltException( f"Invalid logical key {logical_key!r}. " @@ -1181,7 +1204,7 @@ def _set(self, logical_key, entry=None, meta=None, serialization_location=None, size, version_id = get_size_and_version(src) # Determine if a new version needs to be appended. - if not src.is_local() and src.version_id is None and version_id is not None: + if not src.is_local() and src.version_id is None and version_id is not None and not unversioned: src.version_id = version_id entry = PackageEntry(src, size, None, None) elif isinstance(entry, PackageEntry): diff --git a/api/python/tests/integration/test_packages.py b/api/python/tests/integration/test_packages.py index b00926df154..84bcb1fd1a2 100644 --- a/api/python/tests/integration/test_packages.py +++ b/api/python/tests/integration/test_packages.py @@ -631,6 +631,25 @@ def test_set_dir_wrong_update_policy(self): pkg.set_dir("nested", DATA_DIR, update_policy='invalid_policy') assert expected_err in str(e.value) + @mock.patch("quilt3.packages.list_objects") + @mock.patch("quilt3.packages.list_object_versions") + def test_set_dir_unversioned(self, list_object_versions_mock, list_objects_mock): + list_objects_mock.return_value = [ + { + "Key": "foo/bar.txt", + "Size": 123, + }, + ] + + pkg = Package().set_dir(".", "s3://bucket/foo", unversioned=True) + + list_object_versions_mock.assert_not_called() + list_objects_mock.assert_called_once_with("bucket", "foo/", recursive=True) + assert [ + (lk, e.get()) + for lk, e in pkg.walk() + ] == [("bar.txt", "s3://bucket/foo/bar.txt")] + def test_package_entry_meta(self): pkg = ( Package() @@ -746,6 +765,17 @@ def test_set_package_entry_as_object(self): file_path = pkg[lk].physical_key.path assert not pathlib.Path(file_path).exists(), "These temp files should have been deleted during push()" + @patch("quilt3.packages.get_size_and_version", mock.Mock(return_value=(123, "v1"))) + def test_set_package_entry_unversioned_flag(self): + for flag_value, version_id in { + True: None, + False: "v1", + }.items(): + with self.subTest(flag_value=flag_value, version_id=version_id): + pkg = Package() + pkg.set("bar", "s3://bucket/bar", unversioned=flag_value) + assert pkg["bar"].physical_key == PhysicalKey("bucket", "bar", version_id) + def test_tophash_changes(self): test_file = Path('test.txt') test_file.write_text('asdf', 'utf-8') diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 83cce4182cf..94b275be1dc 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -17,6 +17,7 @@ Entries inside each section should be ordered by type: # unreleased - YYYY-MM-DD ## Python API +* [Added] New 'unversioned' parameter to `Package.set_dir()` and `Package.set()` for use with S3 URIs, such as HealthOmics, that do not support `ListBucketVersions` and/or `GetObjectVersion` ([#3927](https://github.com/quiltdata/quilt/pull/3927)) * [Removed] Drop Python 3.7 support ([#3841](https://github.com/quiltdata/quilt/pull/3841)) * [Changed] Set S3 client `max_pool_connections` to `QUILT_TRANSFER_MAX_CONCURRENCY` ([#3867](https://github.com/quiltdata/quilt/pull/3867)) * [Changed] **BREAKING:** Switch from a regular SHA256 checksum to a hash list (`sha2-256-chunked`) to match S3's built-in checksums ([#2782](https://github.com/quiltdata/quilt/pull/2782)) diff --git a/docs/api-reference/Package.md b/docs/api-reference/Package.md index c9c87ce4907..f3b023e165d 100644 --- a/docs/api-reference/Package.md +++ b/docs/api-reference/Package.md @@ -131,7 +131,7 @@ json decode error invalid package exception -## Package.set\_dir(self, lkey, path=None, meta=None, update\_policy='incoming') {#Package.set\_dir} +## Package.set\_dir(self, lkey, path=None, meta=None, update\_policy='incoming', unversioned: bool = False) {#Package.set\_dir} Adds all files from `path` to the package. @@ -149,6 +149,7 @@ __Arguments__ If 'incoming', whenever logical keys match, always take the new entry from set_dir. If 'existing', whenever logical keys match, retain existing entries and ignore new entries from set_dir. +* __unversioned(bool)__: when True, do not retrieve VersionId for S3 physical keys. __Returns__ @@ -230,7 +231,7 @@ fail to create file fail to finish write -## Package.set(self, logical\_key, entry=None, meta=None, serialization\_location=None, serialization\_format\_opts=None) {#Package.set} +## Package.set(self, logical\_key, entry=None, meta=None, serialization\_location=None, serialization\_format\_opts=None, unversioned: bool = False) {#Package.set} Returns self with the object at logical_key set to entry. @@ -251,6 +252,7 @@ __Arguments__ * __https__: //github.com/quiltdata/quilt/blob/master/api/python/quilt3/formats.py * __serialization_location(string)__: Optional. If passed in, only used if entry is an object. Where the serialized object should be written, e.g. "./mydataframe.parquet" +* __unversioned(bool)__: when True, do not retrieve VersionId for S3 physical keys. __Returns__