Skip to content

Commit

Permalink
Implement support for uint64_t values in ICU backend
Browse files Browse the repository at this point in the history
ICU doesn't support uint64_t directly but provides access to formatting
and parsing of decimal number strings.
Use Boost.Charconv to interface with that.

Fixes #235
  • Loading branch information
Flamefire committed Jan 11, 2025
1 parent 4a3fd05 commit 8a58ec4
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 43 deletions.
2 changes: 1 addition & 1 deletion build/Jamfile.v2
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright 2003 John Maddock
# Copyright 2010 Artyom Beilis
# Copyright 2021 - 2022 Alexander Grund
# Copyright 2021 - 2024 Alexander Grund
#
# Distributed under the Boost Software License, Version 1.0.
# https://www.boost.org/LICENSE_1_0.txt.
Expand Down
65 changes: 50 additions & 15 deletions src/icu/formatter.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
// Copyright (c) 2021-2023 Alexander Grund
// Copyright (c) 2021-2024 Alexander Grund
//
// Distributed under the Boost Software License, Version 1.0.
// https://www.boost.org/LICENSE_1_0.txt
Expand All @@ -13,8 +13,12 @@
#include "icu_util.hpp"
#include "time_zone.hpp"
#include "uconv.hpp"
#include <boost/assert.hpp>
#include <boost/charconv/from_chars.hpp>
#include <boost/charconv/to_chars.hpp>
#include <limits>
#include <memory>
#include <sstream>
#ifdef BOOST_MSVC
# pragma warning(push)
# pragma warning(disable : 4251) // "identifier" : class "type" needs to have dll-interface...
Expand Down Expand Up @@ -62,35 +66,69 @@ namespace boost { namespace locale { namespace impl_icu {
// Thin overrides for the individual value types: each one simply forwards
// to the do_format / do_parse member function with the same arguments.
string_type format(int64_t value, size_t& code_points) const override { return do_format(value, code_points); }
string_type format(int32_t value, size_t& code_points) const override { return do_format(value, code_points); }
size_t parse(const string_type& str, double& value) const override { return do_parse(str, value); }
size_t parse(const string_type& str, uint64_t& value) const override { return do_parse(str, value); }
size_t parse(const string_type& str, int64_t& value) const override { return do_parse(str, value); }
size_t parse(const string_type& str, int32_t& value) const override { return do_parse(str, value); }

/// Format an unsigned 64 bit value.
///
/// Values that fit into an int64_t are delegated to the int64_t overload.
/// Anything larger is converted to a decimal digit string first and that string
/// is passed to the ICU formatter. On return `code_points` holds the number of
/// Unicode code points of the formatted text.
string_type format(const uint64_t value, size_t& code_points) const override
{
    constexpr uint64_t largest_native = static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
    if(value > largest_native) {
        // Too large for ICU's widest integer type: hand the number over as decimal text.
        // The buffer has room for all digits of the largest uint64_t plus the terminator.
        char digits[std::numeric_limits<uint64_t>::digits10 + 2];
        auto conv = boost::charconv::to_chars(digits, std::end(digits), value);
        BOOST_ASSERT(conv);
        // ICU expects a NULL-terminated string even for the StringPiece
        *conv.ptr = '\0';

        UErrorCode status = U_ZERO_ERROR;
        icu::UnicodeString formatted;
        icu_fmt_.format(icu::StringPiece(digits, conv.ptr - digits), formatted, nullptr, status);
        check_and_throw_icu_error(status);
        code_points = formatted.countChar32();
        return cvt_.std(formatted);
    }
    // Representable as int64_t, so the regular integer path can be used
    return format(static_cast<int64_t>(value), code_points);
}

private:
/// Read the parsed number as a double into `v`.
/// Returns true if ICU reported success for the conversion, false otherwise.
bool get_value(double& v, icu::Formattable& fmt) const
{
    UErrorCode err = U_ZERO_ERROR;
    v = fmt.getDouble(err);
    // Single exit point: the former if/return pair left an unreachable statement behind
    return U_SUCCESS(err);
}

bool get_value(int64_t& v, icu::Formattable& fmt) const
{
UErrorCode err = U_ZERO_ERROR;
v = fmt.getInt64(err);
return U_SUCCESS(err);
}

/// Read the parsed number as a 64 bit unsigned integer into `v`.
///
/// The int64_t accessor is tried first. If it reports an error, the number is
/// requested as a decimal string and converted with Boost.Charconv.
/// `v` is only written when the function returns true.
/// Returns false for negative values, for non-numeric content and for decimal
/// strings that cannot be converted completely to an unsigned integer.
bool get_value(uint64_t& v, icu::Formattable& fmt) const
{
    UErrorCode err = U_ZERO_ERROR;
    // ICU only supports int64_t as the largest integer type
    const int64_t tmp = fmt.getInt64(err);
    if(U_SUCCESS(err)) {
        if(tmp < 0)
            return false;
        v = static_cast<uint64_t>(tmp);
        return true;
    }
    // Get value as a decimal number and parse that
    err = U_ZERO_ERROR;
    const auto decimals = fmt.getDecimalNumber(err);
    if(U_FAILURE(err))
        return false; // Not a number
    const char* const first = decimals.data();
    const auto size = static_cast<size_t>(decimals.length());
    // Convert into a temporary so a rejected input does not modify the output value
    uint64_t parsed = 0;
    const auto res = boost::charconv::from_chars({first, size}, parsed);
    // Leftover characters (e.g. a decimal point or an exponent) mean that only
    // a prefix was converted, which must not be reported as the parsed value
    if(!res || res.ptr != first + size)
        return false;
    v = parsed;
    return true;
}

/// Read the parsed number as a 32 bit signed integer into `v`.
/// Returns true if ICU reported success for the conversion, false otherwise.
bool get_value(int32_t& v, icu::Formattable& fmt) const
{
    UErrorCode err = U_ZERO_ERROR;
    v = fmt.getLong(err);
    // Single exit point: the former if/return pair left an unreachable statement behind
    return U_SUCCESS(err);
}

template<typename ValueType>
Expand All @@ -114,14 +152,11 @@ namespace boost { namespace locale { namespace impl_icu {
icu_fmt_.setParseIntegerOnly(std::is_integral<ValueType>::value && isNumberOnly_);
icu_fmt_.parse(tmp, val, pp);

ValueType tmp_v;

if(pp.getIndex() == 0 || !get_value(tmp_v, val))
if(pp.getIndex() == 0 || !get_value(v, val))
return 0;
size_t cut = cvt_.cut(tmp, str.data(), str.data() + str.size(), pp.getIndex());
if(cut == 0)
return 0;
v = tmp_v;
return cut;
}

Expand All @@ -136,11 +171,11 @@ namespace boost { namespace locale { namespace impl_icu {
typedef std::basic_string<CharType> string_type;

// Formatting overrides: every value type is passed on unchanged to do_format.
string_type format(double value, size_t& code_points) const override { return do_format(value, code_points); }
string_type format(uint64_t value, size_t& code_points) const override { return do_format(value, code_points); }
string_type format(int64_t value, size_t& code_points) const override { return do_format(value, code_points); }

string_type format(int32_t value, size_t& code_points) const override { return do_format(value, code_points); }

// Parsing overrides: every value type is passed on unchanged to do_parse.
size_t parse(const string_type& str, double& value) const override { return do_parse(str, value); }
size_t parse(const string_type& str, uint64_t& value) const override { return do_parse(str, value); }
size_t parse(const string_type& str, int64_t& value) const override { return do_parse(str, value); }
size_t parse(const string_type& str, int32_t& value) const override { return do_parse(str, value); }

Expand Down
6 changes: 6 additions & 0 deletions src/icu/formatter.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
// Copyright (c) 2024 Alexander Grund
//
// Distributed under the Boost Software License, Version 1.0.
// https://www.boost.org/LICENSE_1_0.txt
Expand Down Expand Up @@ -31,6 +32,8 @@ namespace boost { namespace locale { namespace impl_icu {
/// Format the value and return the formatted string.
/// The number of Unicode code points of the result is passed back via \a code_points
virtual string_type format(double value, size_t& code_points) const = 0;
/// Format the value and return the formatted string.
/// The number of Unicode code points of the result is passed back via \a code_points
virtual string_type format(uint64_t value, size_t& code_points) const = 0;
/// Format the value and return the formatted string.
/// The number of Unicode code points of the result is passed back via \a code_points
virtual string_type format(int64_t value, size_t& code_points) const = 0;
/// Format the value and return the formatted string.
/// The number of Unicode code points of the result is passed back via \a code_points
virtual string_type format(int32_t value, size_t& code_points) const = 0;
Expand All @@ -40,6 +43,9 @@ namespace boost { namespace locale { namespace impl_icu {
virtual size_t parse(const string_type& str, double& value) const = 0;
/// Parse the string and return the number of used characters. If it returns 0
/// then parsing failed.
virtual size_t parse(const string_type& str, uint64_t& value) const = 0;
/// Parse the string and return the number of used characters. If it returns 0
/// then parsing failed.
virtual size_t parse(const string_type& str, int64_t& value) const = 0;
/// Parse the string and return the number of used characters. If it returns 0
/// then parsing failed.
Expand Down
41 changes: 16 additions & 25 deletions src/icu/numeric.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
// Copyright (c) 2024 Alexander Grund
//
// Distributed under the Boost Software License, Version 1.0.
// https://www.boost.org/LICENSE_1_0.txt
Expand All @@ -19,41 +20,31 @@
namespace boost { namespace locale { namespace impl_icu {

namespace detail {
template<typename T, bool integer = std::numeric_limits<T>::is_integer>
struct icu_format_type;
/// Type selector: `type` is `PreferredType` when it has at least as many value
/// digits (as reported by std::numeric_limits) as `T`, otherwise `AlternativeType`.
template<typename T, typename PreferredType, typename AlternativeType>
struct choose_type_by_digits {
    using type =
      typename std::conditional<(std::numeric_limits<T>::digits > std::numeric_limits<PreferredType>::digits),
                                AlternativeType,
                                PreferredType>::type;
};

template<typename T>
struct icu_format_type<T, true> {
// ICU supports 32 and 64 bit ints, use the former as long as it fits, else the latter
typedef typename std::conditional<std::numeric_limits<T>::digits <= 31, int32_t, int64_t>::type type;
template<typename T, bool integer = std::numeric_limits<T>::is_integer>
struct icu_format_type {
static_assert(sizeof(T) <= sizeof(int64_t), "Only up to 64 bit integer types are supported by ICU");
// ICU supports (only) int32_t and int64_t, use the former as long as it fits, else the latter
using large_type = typename choose_type_by_digits<T, int64_t, uint64_t>::type;
using type = typename choose_type_by_digits<T, int32_t, large_type>::type;
};
template<typename T>
struct icu_format_type<T, false> {
// Only float type ICU supports is double
typedef double type;
};

// ICU does not support uint64_t values so fall back to the parent/std formatting
// if the number is too large to fit into an int64_t
template<typename T,
bool BigUInt = !std::numeric_limits<T>::is_signed && std::numeric_limits<T>::is_integer
&& (sizeof(T) >= sizeof(uint64_t))>
struct use_parent_traits {
static bool use(T /*v*/) { return false; }
};
template<typename T>
struct use_parent_traits<T, true> {
static bool use(T v) { return v > static_cast<T>(std::numeric_limits<int64_t>::max()); }
using type = double;
};

template<typename ValueType>
static bool use_parent(std::ios_base& ios, ValueType v)
static bool use_parent(std::ios_base& ios)
{
const uint64_t flg = ios_info::get(ios).display_flags();
if(flg == flags::posix)
return true;
if(use_parent_traits<ValueType>::use(v))
return true;

if(!std::numeric_limits<ValueType>::is_integer)
return false;
Expand Down Expand Up @@ -105,7 +96,7 @@ namespace boost { namespace locale { namespace impl_icu {
template<typename ValueType>
iter_type do_real_put(iter_type out, std::ios_base& ios, CharType fill, ValueType val) const
{
if(detail::use_parent(ios, val))
if(detail::use_parent<ValueType>(ios))
return std::num_put<CharType>::do_put(out, ios, fill, val);

const std::unique_ptr<formatter_type> formatter = formatter_type::create(ios, loc_, enc_);
Expand Down Expand Up @@ -240,7 +231,7 @@ namespace boost { namespace locale { namespace impl_icu {
do_real_get(iter_type in, iter_type end, std::ios_base& ios, std::ios_base::iostate& err, ValueType& val) const
{
stream_type* stream_ptr = dynamic_cast<stream_type*>(&ios);
if(!stream_ptr || detail::use_parent(ios, ValueType(0)))
if(!stream_ptr || detail::use_parent<ValueType>(ios))
return std::num_get<CharType>::do_get(in, end, ios, err, val);

const std::unique_ptr<formatter_type> formatter = formatter_type::create(ios, loc_, enc_);
Expand Down
59 changes: 57 additions & 2 deletions test/test_formatting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,18 +380,22 @@ void test_manip(std::string e_charset = "UTF-8")
TEST_MIN_MAX(int16_t, "-32,768", "32,767");
TEST_MIN_MAX(uint16_t, "0", "65,535");
TEST_PARSE_FAILS(as::number, "-1", uint16_t);
TEST_PARSE_FAILS(as::number, "-32,767", uint16_t);
if(stdlib_correctly_errors_on_out_of_range_int16())
TEST_PARSE_FAILS(as::number, "65,535", int16_t);

TEST_MIN_MAX(int32_t, "-2,147,483,648", "2,147,483,647");
TEST_MIN_MAX(uint32_t, "0", "4,294,967,295");
TEST_PARSE_FAILS(as::number, "-1", uint32_t);
TEST_PARSE_FAILS(as::number, "-2,147,483,647", uint32_t);
TEST_PARSE_FAILS(as::number, "4,294,967,295", int32_t);

TEST_MIN_MAX(int64_t, "-9,223,372,036,854,775,808", "9,223,372,036,854,775,807");
// ICU does not support uint64, but we have a fallback to format it at least
TEST_MIN_MAX_FMT(as::number, uint64_t, "0", "18446744073709551615");
TEST_MIN_MAX(uint64_t, "0", "18,446,744,073,709,551,615");
TEST_PARSE_FAILS(as::number, "-1", uint64_t);
TEST_PARSE_FAILS(as::number, "-9,223,372,036,854,775,807", uint64_t);
TEST_PARSE_FAILS(as::number, "18,446,744,073,709,551,615", int64_t);
TEST_PARSE_FAILS(as::number, "18,446,744,073,709,551,616", uint64_t);

TEST_FMT_PARSE_3(as::number, std::left, std::setw(3), 15, "15 ");
TEST_FMT_PARSE_3(as::number, std::right, std::setw(3), 15, " 15");
Expand Down Expand Up @@ -857,6 +861,55 @@ void test_format_class(std::string charset = "UTF-8")
TEST_FORMAT_CLS("{1,gmt,ftime='%D'}", a_datetime, "12/31/13");
}

/// Test formatting and parsing of uint64_t values that are not natively supported by ICU.
/// They use a custom code path which gets exercised by this.
///
/// For one locale per language reported by ICU, a value just above the int64_t range
/// is formatted through a stream using `as::number` and parsed back again.
/// Locales for which ICU cannot provide or use a number formatter are skipped.
void test_uint64_format()
{
#ifdef BOOST_LOCALE_WITH_ICU
    std::set<std::string> tested_langs;
    int32_t count;
    auto* cur_locale = icu::Locale::getAvailableLocales(count);
    constexpr uint64_t value = std::numeric_limits<int64_t>::max() + uint64_t(3);
    const std::string posix_value = as_posix_string(value);
    constexpr int32_t short_value = std::numeric_limits<int32_t>::max();
    const std::string posix_short_value = as_posix_string(short_value);
    boost::locale::generator g;
    const std::string utf8 = ".UTF-8";
    // Test with each language supported by ICU to ensure the implementation really
    // is independent of the language and doesn't fail e.g. for different separators.
    for(int i = 0; i < count; i++, cur_locale++) {
        if(!tested_langs.insert(cur_locale->getLanguage()).second)
            continue;
        TEST_CONTEXT(cur_locale->getName());
        UErrorCode err{};
        std::unique_ptr<icu::NumberFormat> fmt{icu::NumberFormat::createInstance(*cur_locale, err)};
        // Creation may fail and then yields no formatter, so check before dereferencing it
        if(U_FAILURE(err) || !fmt)
            continue;
        icu::UnicodeString s;
        fmt->format(short_value, s, nullptr, err);
        if(U_FAILURE(err))
            continue;
        const std::string icu_value = boost::locale::conv::utf_to_utf<char>(s.getBuffer(), s.getBuffer() + s.length());
        std::stringstream ss;
        ss.imbue(g(cur_locale->getName() + utf8));
        ss << boost::locale::as::number;
        // Sanity check
        ss << short_value;
        TEST_EQ(ss.str(), icu_value);

        // Assumption: Either both the int32 and uint64 values are in POSIX format, or neither are
        // This is the case if separators are used and/or numbers are not ASCII
        empty_stream(ss) << value;
        if(icu_value == posix_short_value)
            TEST_EQ(ss.str(), posix_value);
        else
            TEST_NE(ss.str(), posix_value);

        // Round trip: the formatted text must parse back to the original value
        uint64_t parsed_value{};
        TEST(ss >> parsed_value);
        TEST_EQ(parsed_value, value);
    }
#endif
}

BOOST_LOCALE_DISABLE_UNREACHABLE_CODE_WARNING
void test_main(int argc, char** argv)
{
Expand All @@ -867,6 +920,8 @@ void test_main(int argc, char** argv)
std::cout << "ICU is not build... Skipping\n";
return;
#endif
test_uint64_format();

boost::locale::time_zone::global("GMT+4:00");
std::cout << "Testing char, UTF-8" << std::endl;
test_manip<char>();
Expand Down

0 comments on commit 8a58ec4

Please sign in to comment.