diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h index 258f9638aaa5d..b2017b2a738a1 100644 --- a/velox/expression/CastExpr-inl.h +++ b/velox/expression/CastExpr-inl.h @@ -727,34 +727,6 @@ void CastExpr::applyCastPrimitives( }); } } - - // If we're converting to a TIMESTAMP, check if we need to adjust the - // current GMT timezone to the user provided session timezone. - if constexpr (ToKind == TypeKind::TIMESTAMP) { - const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); - // If user explicitly asked us to adjust the timezone. - if (queryConfig.adjustTimestampToTimezone()) { - auto sessionTzName = queryConfig.sessionTimezone(); - if (!sessionTzName.empty()) { - // When context.throwOnError is false, some rows will be marked as - // 'failed'. These rows should not be processed further. 'remainingRows' - // will contain a subset of 'rows' that have passed all the checks (e.g. - // keys are not nulls and number of keys and values is the same). - exec::LocalSelectivityVector remainingRows(context, rows); - context.deselectErrors(*remainingRows); - - // locate_zone throws runtime_error if the timezone couldn't be found - // (so we're safe to dereference the pointer). - auto* timeZone = date::locate_zone(sessionTzName); - auto rawTimestamps = resultFlatVector->mutableRawValues(); - - applyToSelectedNoThrowLocal( - context, *remainingRows, result, [&](int row) { - rawTimestamps[row].toGMT(*timeZone); - }); - } - } - } } template diff --git a/velox/expression/PrestoCastHooks.cpp b/velox/expression/PrestoCastHooks.cpp index 1fa866650c801..dce38f2239146 100644 --- a/velox/expression/PrestoCastHooks.cpp +++ b/velox/expression/PrestoCastHooks.cpp @@ -32,7 +32,22 @@ PrestoCastHooks::PrestoCastHooks(const core::QueryConfig& config) } Timestamp PrestoCastHooks::castStringToTimestamp(const StringView& view) const { - return util::fromTimestampString(view.data(), view.size()); + auto result = util::fromTimestampWithTimezoneString(view.data(), view.size()); + + // If the parsed string has timezone information, convert the timestamp at + // GMT at that time. For example, "1970-01-01 00:00:00 -00:01" is 60 seconds + // at GMT. + if (result.second != -1) { + result.first.toGMT(result.second); + + } + // If no timezone information is available in the input string, check if we + // should understand it as being at the session timezone, and if so, convert + // to GMT. + else if (options_.timeZone != nullptr) { + result.first.toGMT(*options_.timeZone); + } + return result.first; } int32_t PrestoCastHooks::castStringToDate(const StringView& dateString) const { diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 25f904a08146d..82321db4f296e 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -589,6 +589,24 @@ TEST_F(CastExprTest, stringToTimestamp) { std::nullopt, }; testCast("timestamp", input, expected); + + // Try with a different session timezone. + setTimezone("America/Los_Angeles"); + input = { + "1970-01-01 00:00", + "1970-01-01 00:00 +00:00", + "1970-01-01 00:00 +01:00", + "1970-01-01 00:00 America/Sao_Paulo", + "2000-01-01 12:21:56Z", + }; + expected = { + Timestamp(28800, 0), + Timestamp(0, 0), + Timestamp(-3600, 0), + Timestamp(10800, 0), + Timestamp(946729316, 0), + }; + testCast("timestamp", input, expected); } TEST_F(CastExprTest, timestampToString) { @@ -765,7 +783,7 @@ TEST_F(CastExprTest, timestampAdjustToTimezone) { Timestamp(946713600, 0), Timestamp(0, 0), Timestamp(946758116, 0), - Timestamp(-21600, 0), + Timestamp(-50400, 0), std::nullopt, Timestamp(957164400, 0), }); diff --git a/velox/type/TimestampConversion.cpp b/velox/type/TimestampConversion.cpp index e3ab6885e609f..4d7e88e8ffb51 100644 --- a/velox/type/TimestampConversion.cpp +++ b/velox/type/TimestampConversion.cpp @@ -43,6 +43,7 @@ #include "velox/type/TimestampConversion.h" #include "velox/common/base/CheckedArithmetic.h" #include "velox/common/base/Exceptions.h" +#include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::util { @@ -161,6 +162,13 @@ inline bool validDate(int64_t daysSinceEpoch) { daysSinceEpoch <= std::numeric_limits::max(); } +// Skip leading spaces. +inline void skipSpaces(const char* buf, size_t len, size_t& pos) { + while (pos < len && characterIsSpace(buf[pos])) { + pos++; + } +} + bool tryParseDateString( const char* buf, size_t len, @@ -177,11 +185,7 @@ bool tryParseDateString( int32_t year = 0; bool yearneg = false; int sep; - - // Skip leading spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); if (pos >= len) { return false; @@ -320,10 +324,8 @@ bool tryParseDateString( // In strict mode, check remaining string for non-space characters. if (mode == ParseMode::kStrict) { - // Skip trailing spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); + // Check position. if end was not reached, non-space chars remaining. if (pos < len) { return false; @@ -353,11 +355,7 @@ bool tryParseTimeString( if (len == 0) { return false; } - - // Skip leading spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); if (pos >= len) { return false; @@ -424,10 +422,7 @@ bool tryParseTimeString( // In strict mode, check remaining string for non-space characters. if (strict) { - // Skip trailing spaces. - while (pos < len && characterIsSpace(buf[pos])) { - pos++; - } + skipSpaces(buf, len, pos); // Check position. If end was not reached, non-space chars remaining. if (pos < len) { @@ -438,6 +433,43 @@ bool tryParseTimeString( return true; } +// String format is "YYYY-MM-DD hh:mm:ss.microseconds" (seconds and microseconds +// are optional). ISO 8601 +bool tryParseTimestampString( + const char* buf, + size_t len, + size_t& pos, + Timestamp& result) { + int64_t daysSinceEpoch = 0; + int64_t microsSinceMidnight = 0; + + if (!tryParseDateString( + buf, len, pos, daysSinceEpoch, ParseMode::kNonStrict)) { + return false; + } + + if (pos == len) { + // No time: only a date. + result = fromDatetime(daysSinceEpoch, 0); + return true; + } + + // Try to parse a time field. + if (buf[pos] == ' ' || buf[pos] == 'T') { + pos++; + } + + size_t timePos = 0; + if (!tryParseTimeString( + buf + pos, len - pos, timePos, microsSinceMidnight, false)) { + return false; + } + + pos += timePos; + result = fromDatetime(daysSinceEpoch, microsSinceMidnight); + return true; +} + bool tryParseUTCOffsetString( const char* buf, size_t& pos, @@ -702,55 +734,53 @@ namespace { Timestamp fromTimestampString(const char* str, size_t len) { size_t pos; - int64_t daysSinceEpoch; - int64_t microsSinceMidnight; + Timestamp resultTimestamp; - if (!tryParseDateString( - str, len, pos, daysSinceEpoch, ParseMode::kNonStrict)) { + if (!tryParseTimestampString(str, len, pos, resultTimestamp)) { parserError(str, len); } + skipSpaces(str, len, pos); - if (pos == len) { - // No time: only a date. - return fromDatetime(daysSinceEpoch, 0); + if (pos >= len) { + return resultTimestamp; } + parserError(str, len); +} - // Try to parse a time field. - if (str[pos] == ' ' || str[pos] == 'T') { - pos++; - } +std::pair fromTimestampWithTimezoneString( + const char* str, + size_t len) { + size_t pos; + Timestamp resultTimestamp; - size_t timePos = 0; - if (!tryParseTimeString( - str + pos, len - pos, timePos, microsSinceMidnight, false)) { + if (!tryParseTimestampString(str, len, pos, resultTimestamp)) { parserError(str, len); } - pos += timePos; - auto timestamp = fromDatetime(daysSinceEpoch, microsSinceMidnight); + int64_t timezoneID = -1; + skipSpaces(str, len, pos); + // If there is anything left to parse, it must be a timezone definition. if (pos < len) { - // Skip a "Z" at the end (as per the ISO 8601 specs). - if (str[pos] == 'Z') { - pos++; + size_t timezonePos = pos; + while (timezonePos < len && !characterIsSpace(str[timezonePos])) { + timezonePos++; } - int hourOffset, minuteOffset; - if (tryParseUTCOffsetString(str, pos, len, hourOffset, minuteOffset)) { - int32_t secondOffset = - (hourOffset * kSecsPerHour) + (minuteOffset * kSecsPerMinute); - timestamp = Timestamp( - timestamp.getSeconds() - secondOffset, timestamp.getNanos()); + std::string_view timezone(str + pos, timezonePos - pos); + + if ((timezoneID = util::getTimeZoneID(timezone, false)) == -1) { + VELOX_USER_FAIL("Unknown timezone value: \"{}\"", timezone); } // Skip any spaces at the end. - while (pos < len && characterIsSpace(str[pos])) { - pos++; - } + pos = timezonePos; + skipSpaces(str, len, pos); + if (pos < len) { parserError(str, len); } } - return timestamp; + return {resultTimestamp, timezoneID}; } } // namespace facebook::velox::util diff --git a/velox/type/TimestampConversion.h b/velox/type/TimestampConversion.h index 9bae1aa632ea7..cf723ac43eb39 100644 --- a/velox/type/TimestampConversion.h +++ b/velox/type/TimestampConversion.h @@ -131,14 +131,51 @@ inline int64_t fromTimeString(const StringView& str) { // Timestamp conversion -/// Parses a full ISO 8601 timestamp string, following the format -/// "YYYY-MM-DD HH:MM[:SS[.MS]] +00:00" +/// Parses a full ISO 8601 timestamp string, following the format: +/// +/// "YYYY-MM-DD HH:MM[:SS[.MS]]" +/// +/// Seconds and miliseconds are optional. +/// +/// This function does not accept any timezone information in the string (e.g. +/// UTC, Z, or a timezone offsets). This is because the returned timestamp does +/// not contain timezone information; therefore, it would either be required for +/// this function to convert the parsed timestamp (but we don't know the +/// original timezone), or ignore the timezone information, which would be +/// incorecct. +/// +/// For a timezone-aware version of this function, check +/// `fromTimestampWithTimezoneString()` below. Timestamp fromTimestampString(const char* buf, size_t len); inline Timestamp fromTimestampString(const StringView& str) { return fromTimestampString(str.data(), str.size()); } +/// Parses a full ISO 8601 timestamp string, following the format: +/// +/// "YYYY-MM-DD HH:MM[:SS[.MS]] +00:00" +/// +/// This is a timezone-aware version of the function above +/// `fromTimestampString()` which returns both the parsed timestamp and the +/// timezone ID. It is up to the client to do the expected conversion based on +/// these two values. +/// +/// The timezone information at the end of the string may contain a timezone +/// name (as defined in velox/type/tz/*), such as "UTC" or +/// "America/Los_Angeles", or a timezone offset, like "+06:00" or "-09:30". The +/// white space between the hour definition and timestamp is optional. +/// +/// -1 means no timezone information was found. Throws VeloxUserError in case of +/// parsing errors. +std::pair fromTimestampWithTimezoneString( + const char* buf, + size_t len); + +inline auto fromTimestampWithTimezoneString(const StringView& str) { + return fromTimestampWithTimezoneString(str.data(), str.size()); +} + Timestamp fromDatetime(int64_t daysSinceEpoch, int64_t microsSinceMidnight); } // namespace facebook::velox::util diff --git a/velox/type/tests/ConversionsTest.cpp b/velox/type/tests/ConversionsTest.cpp index 84890bcf66a9a..5ea162f0f8122 100644 --- a/velox/type/tests/ConversionsTest.cpp +++ b/velox/type/tests/ConversionsTest.cpp @@ -1012,14 +1012,14 @@ TEST_F(ConversionsTest, toTimestamp) { "2000-01-01", "1970-01-01 00:00:00", "2000-01-01 12:21:56", - "1970-01-01 00:00:00-02:00", + "1970-01-01 00:01", }, { Timestamp(0, 0), Timestamp(946684800, 0), Timestamp(0, 0), Timestamp(946729316, 0), - Timestamp(7200, 0), + Timestamp(60, 0), }); // Invalid case. diff --git a/velox/type/tests/TimestampConversionTest.cpp b/velox/type/tests/TimestampConversionTest.cpp index c7239d08783d3..371c8ad33d405 100644 --- a/velox/type/tests/TimestampConversionTest.cpp +++ b/velox/type/tests/TimestampConversionTest.cpp @@ -255,6 +255,7 @@ TEST(DateTimeUtilTest, fromTimestampString) { EXPECT_EQ(Timestamp(0, 0), fromTimestampString("1970-01-01 00:00")); EXPECT_EQ(Timestamp(0, 0), fromTimestampString("1970-01-01 00:00:00")); + EXPECT_EQ(Timestamp(0, 0), fromTimestampString("1970-01-01 00:00:00 ")); EXPECT_EQ( Timestamp(946729316, 0), fromTimestampString("2000-01-01 12:21:56")); @@ -262,33 +263,68 @@ TEST(DateTimeUtilTest, fromTimestampString) { Timestamp(946729316, 0), fromTimestampString("2000-01-01T12:21:56")); EXPECT_EQ( Timestamp(946729316, 0), fromTimestampString("2000-01-01T 12:21:56")); - - // Test UTC offsets. - EXPECT_EQ( - Timestamp(7200, 0), fromTimestampString("1970-01-01 00:00:00-02:00")); - EXPECT_EQ( - Timestamp(946697400, 0), - fromTimestampString("2000-01-01 00:00:00Z-03:30")); - EXPECT_EQ( - Timestamp(1587583417, 0), - fromTimestampString("2020-04-23 04:23:37+09:00")); } -TEST(DateTimeUtilTest, fromTimestampStrInvalid) { +TEST(DateTimeUtilTest, fromTimestampStringInvalid) { // Needs at least a date. EXPECT_THROW(fromTimestampString(""), VeloxUserError); EXPECT_THROW(fromTimestampString("00:00:00"), VeloxUserError); - // Broken UTC offsets. - EXPECT_THROW(fromTimestampString("1970-01-01 00:00:00-asd"), VeloxUserError); - EXPECT_THROW( - fromTimestampString("1970-01-01 00:00:00+00:00:00"), VeloxUserError); - // Integer overflow during timestamp parsing. EXPECT_THROW( fromTimestampString("2773581570-01-01 00:00:00-asd"), VeloxUserError); EXPECT_THROW( fromTimestampString("-2147483648-01-01 00:00:00-asd"), VeloxUserError); + + // Unexpected timezone definition. + EXPECT_THROW( + fromTimestampString("1970-01-01 00:00:00 a"), VeloxUserError); + EXPECT_THROW(fromTimestampString("1970-01-01 00:00:00Z"), VeloxUserError); + EXPECT_THROW(fromTimestampString("1970-01-01 00:00:00Z"), VeloxUserError); + EXPECT_THROW(fromTimestampString("1970-01-01 00:00:00 UTC"), VeloxUserError); + EXPECT_THROW( + fromTimestampString("1970-01-01 00:00:00 America/Los_Angeles"), + VeloxUserError); + + // Parse timestamp with (broken) timezones. + EXPECT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00-asd"), + VeloxUserError); + EXPECT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00Z UTC"), + VeloxUserError); + EXPECT_THROW( + fromTimestampWithTimezoneString("1970-01-01 00:00:00+00:00:00"), + VeloxUserError); +} + +TEST(DateTimeUtilTest, fromTimestampWithTimezoneString) { + // -1 means no timezone information. + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:00:00"), + std::make_pair(Timestamp(0, 0), -1L)); + + // Test timezone offsets. + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:00:00 -02:00"), + std::make_pair(Timestamp(0, 0), util::getTimeZoneID("-02:00"))); + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:00:00+13:36"), + std::make_pair(Timestamp(0, 0), util::getTimeZoneID("+13:36"))); + + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:00:00Z"), + std::make_pair(Timestamp(0, 0), util::getTimeZoneID("UTC"))); + + EXPECT_EQ( + fromTimestampWithTimezoneString("1970-01-01 00:01:00 UTC"), + std::make_pair(Timestamp(60, 0), util::getTimeZoneID("UTC"))); + + EXPECT_EQ( + fromTimestampWithTimezoneString( + "1970-01-01 00:00:01 America/Los_Angeles"), + std::make_pair( + Timestamp(1, 0), util::getTimeZoneID("America/Los_Angeles"))); } TEST(DateTimeUtilTest, toGMT) {