Skip to content

Commit

Permalink
Merge branch 'release-4.0.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
mpenkov committed Nov 24, 2020
2 parents a0fa7fc + e3bf467 commit a68da3a
Show file tree
Hide file tree
Showing 14 changed files with 203 additions and 47 deletions.
12 changes: 5 additions & 7 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,6 @@ os: linux

jobs:
include:
- python: '3.7'
name: "flake8"
env: TOXENV="flake8"

- python: '3.5'
env: TOXENV="check_keys,py35-test,py35-integration"

- python: '3.6'
env:
- SO_BUCKET: smart-open
Expand All @@ -27,11 +20,16 @@ jobs:
- BOTO_CONFIG: "/dev/null"
- SO_ENABLE_MOTO_SERVER: "1"
- TOXENV: "check_keys,py37-doctest,enable_moto_server,py37-test,py37-benchmark,py37-integration,disable_moto_server"

- python: '3.8'
env:
- BOTO_CONFIG: "/dev/null"
- TOXENV: "check_keys,py38-doctest,test_coverage,py38-integration"

- python: '3.8'
name: "flake8"
env: TOXENV="flake8"

install:
- pip install tox

Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Unreleased

# 4.0.0, 24 Nov 2020

- Fix reading empty file or seeking past end of file for s3 backend (PR [#549](https://github.com/RaRe-Technologies/smart_open/pull/549), [@jcushman](https://github.com/jcushman))
- Fix handling of rt/wt mode when working with gzip compression (PR [#559](https://github.com/RaRe-Technologies/smart_open/pull/559), [@mpenkov](https://github.com/mpenkov))
- Bump minimum Python version to 3.6 (PR [#562](https://github.com/RaRe-Technologies/smart_open/pull/562), [@mpenkov](https://github.com/mpenkov))

# 3.0.0, 8 Oct 2020

This release modifies the behavior of setup.py with respect to dependencies.
Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ Installation
pip install smart_open // Install with no cloud dependencies
pip install smart_open[s3] // Install S3 deps
pip install smart_open[gcp] // Install GCP deps
pip install smart_open[azure] // Install Azure deps
pip install smart_open[all] // Installs all cloud dependencies

Or, if you prefer to install from the `source tar.gz <http://pypi.python.org/pypi/smart_open>`_::
Expand Down
1 change: 0 additions & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ environment:
TOX_PARALLEL_NO_SPINNER: 1

matrix:
- TOXENV: "check_keys,py35-test"
- TOXENV: "check_keys,py36-test"
- TOXENV: "check_keys,py37-test"
- TOXENV: "check_keys,py38-test"
Expand Down
9 changes: 6 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ def read(fname):
aws_deps = ['boto3']
gcp_deps = ['google-cloud-storage']
azure_deps = ['azure-storage-blob', 'azure-common', 'azure-core']
http_deps = ['requests']

all_deps = install_requires + aws_deps + gcp_deps + azure_deps
all_deps = install_requires + aws_deps + gcp_deps + azure_deps + http_deps

setup(
name='smart_open',
Expand Down Expand Up @@ -90,8 +91,10 @@ def read(fname):
'gcp': gcp_deps,
'azure': azure_deps,
'all': all_deps,
'http': http_deps,
'webhdfs': http_deps,
},
python_requires=">=3.5.*",
python_requires=">=3.6.*",

test_suite="smart_open.tests",

Expand All @@ -101,9 +104,9 @@ def read(fname):
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Topic :: System :: Distributed Computing',
'Topic :: Database :: Front-Ends',
],
Expand Down
5 changes: 4 additions & 1 deletion smart_open/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
import os.path
import urllib.parse

import requests
try:
import requests
except ImportError:
MISSING_DEPS = True

from smart_open import bytebuffer, constants
import smart_open.utils
Expand Down
17 changes: 4 additions & 13 deletions smart_open/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
_SLEEP_SECONDS = 10

# Returned by AWS when we try to seek beyond EOF.
_OUT_OF_RANGE = 'Requested Range Not Satisfiable'
_OUT_OF_RANGE = 'InvalidRange'


def parse_uri(uri_as_string):
Expand Down Expand Up @@ -385,18 +385,9 @@ def _open_body(self, start=None, stop=None):
except IOError as ioe:
# Handle requested content range exceeding content size.
error_response = _unwrap_ioerror(ioe)
if error_response is None or error_response.get('Message') != _OUT_OF_RANGE:
if error_response is None or error_response.get('Code') != _OUT_OF_RANGE:
raise
try:
self._position = self._content_length = int(error_response['ActualObjectSize'])
except KeyError:
# This shouldn't happen with real S3, but moto lacks ActualObjectSize.
# Reported at https://github.com/spulec/moto/issues/2981
self._position = self._content_length = _get(
self._object,
version=self._version_id,
**self._object_kwargs,
)['ContentLength']
self._position = self._content_length = int(error_response['ActualObjectSize'])
self._body = io.BytesIO()
else:
units, start, stop, length = smart_open.utils.parse_content_range(response['ContentRange'])
Expand Down Expand Up @@ -1012,7 +1003,7 @@ def iter_bucket(
bucket_name: str
The name of the bucket.
prefix: str, optional
Limits the iteration to keys starting wit the prefix.
Limits the iteration to keys starting with the prefix.
accept_key: callable, optional
This is a function that accepts a key name (unicode string) and
returns True/False, signalling whether the given key should be downloaded.
Expand Down
71 changes: 62 additions & 9 deletions smart_open/smart_open_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,6 @@

SYSTEM_ENCODING = sys.getdefaultencoding()

_TO_BINARY_LUT = {
'r': 'rb', 'r+': 'rb+', 'rt': 'rb', 'rt+': 'rb+',
'w': 'wb', 'w+': 'wb+', 'wt': 'wb', "wt+": 'wb+',
'a': 'ab', 'a+': 'ab+', 'at': 'ab', 'at+': 'ab+',
}


def _sniff_scheme(uri_as_string):
"""Returns the scheme of the URL only, as a string."""
Expand Down Expand Up @@ -218,12 +212,17 @@ def open(
# filename ---------------> bytes -------------> bytes ---------> text
# binary decompressed decode
#
binary_mode = _TO_BINARY_LUT.get(mode, mode)

try:
binary_mode = _get_binary_mode(mode)
except ValueError as ve:
raise NotImplementedError(ve.args[0])

binary = _open_binary_stream(uri, binary_mode, transport_params)
if ignore_ext:
decompressed = binary
else:
decompressed = compression.compression_wrapper(binary, mode)
decompressed = compression.compression_wrapper(binary, binary_mode)

if 'b' not in mode or explicit_encoding is not None:
decoded = _encoding_wrapper(decompressed, mode, encoding=encoding, errors=errors)
Expand All @@ -233,6 +232,60 @@ def open(
return decoded


def _get_binary_mode(mode_str):
#
# https://docs.python.org/3/library/functions.html#open
#
# The order of characters in the mode parameter appears to be unspecified.
# The implementation follows the examples, just to be safe.
#
mode = list(mode_str)
binmode = []

if 't' in mode and 'b' in mode:
raise ValueError("can't have text and binary mode at once")

counts = [mode.count(x) for x in 'rwa']
if sum(counts) > 1:
raise ValueError("must have exactly one of create/read/write/append mode")

def transfer(char):
binmode.append(mode.pop(mode.index(char)))

if 'a' in mode:
transfer('a')
elif 'w' in mode:
transfer('w')
elif 'r' in mode:
transfer('r')
else:
raise ValueError(
"Must have exactly one of create/read/write/append "
"mode and at most one plus"
)

if 'b' in mode:
transfer('b')
elif 't' in mode:
mode.pop(mode.index('t'))
binmode.append('b')
else:
binmode.append('b')

if '+' in mode:
transfer('+')

#
# There shouldn't be anything left in the mode list at this stage.
# If there is, then either we've missed something and the implementation
# of this function is broken, or the original input mode is invalid.
#
if mode:
raise ValueError('invalid mode: %r' % mode_str)

return ''.join(binmode)


def _shortcut_open(
uri,
mode,
Expand Down Expand Up @@ -317,7 +370,7 @@ def _open_binary_stream(uri, mode, transport_params):
return uri

if not isinstance(uri, str):
raise TypeError("don't know how to handle uri %r" % uri)
raise TypeError("don't know how to handle uri %s" % repr(uri))

scheme = _sniff_scheme(uri)
submodule = transport.get_transport(scheme)
Expand Down
2 changes: 2 additions & 0 deletions smart_open/tests/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# This code is distributed under the terms and conditions
# from the MIT License (MIT).
#
import os
import unittest

import responses
Expand Down Expand Up @@ -38,6 +39,7 @@ def request_callback(request):
return (200, HEADERS, BYTES[start:end])


@unittest.skipIf(os.environ.get('TRAVIS'), 'This test does not work on TravisCI for some reason')
class HttpTest(unittest.TestCase):

@responses.activate
Expand Down
45 changes: 43 additions & 2 deletions smart_open/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,27 @@ def ignore_resource_warnings():
warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*<ssl.SSLSocket.*>") # noqa


@contextmanager
def patch_invalid_range_response(actual_size):
""" Work around a bug in moto (https://github.com/spulec/moto/issues/2981) where the
API response doesn't match when requesting an invalid range of bytes from an S3 GetObject. """
_real_get = smart_open.s3._get

def mock_get(*args, **kwargs):
try:
return _real_get(*args, **kwargs)
except IOError as ioe:
error_response = smart_open.s3._unwrap_ioerror(ioe)
if error_response and error_response.get('Message') == 'Requested Range Not Satisfiable':
error_response['ActualObjectSize'] = actual_size
error_response['Code'] = 'InvalidRange'
error_response['Message'] = 'The requested range is not satisfiable'
raise

with patch('smart_open.s3._get', new=mock_get):
yield


class BaseTest(unittest.TestCase):
@contextmanager
def assertApiCalls(self, **expected_api_calls):
Expand Down Expand Up @@ -236,6 +257,15 @@ def test_seek_end(self):
self.assertEqual(seek, len(content) - 4)
self.assertEqual(fin.read(), b'you?')

def test_seek_past_end(self):
content = u"hello wořld\nhow are you?".encode('utf8')
put_to_bucket(contents=content)

with self.assertApiCalls(GetObject=1), patch_invalid_range_response(str(len(content))):
fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True)
seek = fin.seek(60)
self.assertEqual(seek, len(content))

def test_detect_eof(self):
content = u"hello wořld\nhow are you?".encode('utf8')
put_to_bucket(contents=content)
Expand Down Expand Up @@ -352,6 +382,15 @@ def test_defer_seek(self):
fin.seek(10)
self.assertEqual(fin.read(), content[10:])

def test_read_empty_file(self):
put_to_bucket(contents=b'')

with self.assertApiCalls(GetObject=1), patch_invalid_range_response('0'):
with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin:
data = fin.read()

self.assertEqual(data, b'')


@moto.mock_s3
class MultipartWriterTest(unittest.TestCase):
Expand Down Expand Up @@ -426,7 +465,8 @@ def test_write_04(self):
pass

# read back the same key and check its content
output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb'))
with patch_invalid_range_response('0'):
output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb'))

self.assertEqual(output, [])

Expand Down Expand Up @@ -548,7 +588,8 @@ def test_write_04(self):
pass

# read back the same key and check its content
output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb'))
with patch_invalid_range_response('0'):
output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb'))
self.assertEqual(output, [])

def test_buffered_writer_wrapper_works(self):
Expand Down
Loading

0 comments on commit a68da3a

Please sign in to comment.