diff --git a/tsMuxer/convertUTF.cpp b/tsMuxer/convertUTF.cpp
index aec111a3..8950a809 100644
--- a/tsMuxer/convertUTF.cpp
+++ b/tsMuxer/convertUTF.cpp
@@ -402,6 +402,28 @@ Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd)
     return isLegalUTF8(source, length);
 }
 
+/*
+ * Checks that the whole byte stream [string, string + length) consists of
+ * legal UTF-8 sequences only. A sequence that is cut off by the end of the
+ * stream makes the stream illegal; an empty stream is reported as legal.
+ */
+Boolean isLegalUTF8String(const UTF8* string, int length)
+{
+    const UTF8* const stringEnd = string + length;
+    while (string < stringEnd)
+    {
+        const int seqLength = trailingBytesForUTF8[*string] + 1;
+        /* compare against the remaining length, so that no pointer beyond the end of the stream is ever formed. */
+        /* as the comment for the trailing bytes array notes, valid UTF-8 cannot contain 5- or 6-byte sequences. */
+        if (seqLength >= 5 || seqLength > stringEnd - string || !isLegalUTF8(string, seqLength))
+        {
+            return false;
+        }
+        string += seqLength;
+    }
+    return true;
+}
+
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart,
diff --git a/tsMuxer/convertUTF.h b/tsMuxer/convertUTF.h
index 62c5e44b..0dd97aa3 100644
--- a/tsMuxer/convertUTF.h
+++ b/tsMuxer/convertUTF.h
@@ -141,6 +141,8 @@ extern "C"
 
     Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
 
+    Boolean isLegalUTF8String(const UTF8* string, int length);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tsMuxer/srtStreamReader.cpp b/tsMuxer/srtStreamReader.cpp
index 34064073..93c3860d 100644
--- a/tsMuxer/srtStreamReader.cpp
+++ b/tsMuxer/srtStreamReader.cpp
@@ -3,6 +3,7 @@
 
 #include
 
+#include "convertUTF.h"
 #include "matroskaParser.h"
 #include "memory.h"
 #include "vodCoreException.h"
@@ -54,7 +55,7 @@ bool SRTStreamReader::detectSrcFormat(uint8_t* dataStart, int len, int& prefixLe
     if (len < 4)
         return false;
     // detect UTF-8/UTF-16/UTF-32 format
-    if (dataStart[0] == 0xEF && dataStart[1] == 0xBB && dataStart[2] == 0xBF)
+    if ((dataStart[0] == 0xEF && dataStart[1] == 0xBB && dataStart[2] == 0xBF) || isLegalUTF8String(dataStart, len))
     {
         m_charSize = 1;
         m_srcFormat = UtfConverter::sfUTF8;
@@ -97,9 +98,10 @@ bool SRTStreamReader::detectSrcFormat(uint8_t* dataStart, int len, int& prefixLe
     else
     {
 #ifdef _WIN32
+        LTRACE(LT_INFO, 2, "Failed to auto-detect SRT encoding : falling back to the active code page");
         m_srcFormat = UtfConverter::sfANSI;  // default value for win32
 #else
-        // m_srcFormat = UtfConverter::sfDefault;
+        LTRACE(LT_INFO, 2, "Failed to auto-detect SRT encoding : falling back to UTF-8");
         m_srcFormat = UtfConverter::sfUTF8;
 #endif
     }
diff --git a/tsMuxer/utf8Converter.cpp b/tsMuxer/utf8Converter.cpp
index 440f0fdd..ed9b9027 100644
--- a/tsMuxer/utf8Converter.cpp
+++ b/tsMuxer/utf8Converter.cpp
@@ -71,7 +71,7 @@ std::wstring toWideString(uint8_t* start, size_t widesize, SourceFormat srcForma
 {
     if (widesize == 0)
         return L"";
-#if defined(_WIN32)
+#ifdef _WIN32
     else if (srcFormat == sfANSI)
     {
         wchar_t* widestringnative = new wchar_t[widesize + 1];
@@ -81,39 +81,6 @@ std::wstring toWideString(uint8_t* start, size_t widesize, SourceFormat srcForma
         delete[] widestringnative;
         return resultstring;
     }
-#elif __linux__ == 1
-    /*
-    else if (srcFormat == sfDefault)
-    {
-        if (cd == 0)
-        {
-            sourceEncoding = nl_langinfo(CODESET);
-            LTRACE(LT_INFO, 2, "Default text encoding: " << sourceEncoding.c_str());
-            if (sizeof(wchar_t) == 4)
-                cd = iconv_open("UTF-32", sourceEncoding.c_str());
-            else
-                cd = iconv_open("UTF-16", sourceEncoding.c_str());
-            if (cd == (iconv_t)-1)
-                THROW(ERR_COMMON, "Can't initialize iconv library for source encoding " <<
-                sourceEncoding);
-        }
-
-        wchar_t* widestringnative = new wchar_t[widesize+1];
-        memset(widestringnative, 0, sizeof(wchar_t) * (widesize+1));
-        size_t outLen = (widesize+1) * sizeof(wchar_t);
-        size_t __widesize = widesize;
-        char* src = (char*) start;
-        char* dst = (char*) widestringnative;
-        size_t wlen = iconv(cd, &src, &__widesize, &dst, &outLen);
-        if (wlen == -1)
-            THROW(ERR_COMMON, "Can't convert source string '" << start << "'to unicode");
-        widestringnative[widesize] = 0;
-
-        std::wstring resultstring(widestringnative);
-        delete [] widestringnative;
-        return resultstring;
-    }
-    */
 #endif
     else if (sizeof(wchar_t) == 2)
     {
diff --git a/tsMuxer/utf8Converter.h b/tsMuxer/utf8Converter.h
index 93c84ae7..2e32f460 100644
--- a/tsMuxer/utf8Converter.h
+++ b/tsMuxer/utf8Converter.h
@@ -10,8 +10,7 @@ namespace UtfConverter
 enum SourceFormat
 {
     sfUnknown,
-    sfANSI,
-    sfDefault,
+    sfANSI,  // currently active code page (CP_ACP). used only on Windows.
     sfUTF8,
     sfUTF16le,
     sfUTF16be,
diff --git a/tsMuxer/vod_common.h b/tsMuxer/vod_common.h
index 4b0a21e9..1f81bf82 100644
--- a/tsMuxer/vod_common.h
+++ b/tsMuxer/vod_common.h
@@ -11,18 +11,22 @@
 
 #if 1
 extern bool sLastMsg;
-#define LTRACE(level, errIndex, msg)           \
-    {                                          \
-        if (errIndex & 2)                      \
-        {                                      \
-            if (level <= LT_WARN)              \
-                std::cerr << msg << std::endl; \
-            else if (level == LT_INFO)         \
-                std::cout << msg << std::endl; \
-            if (level <= LT_INFO)              \
-                sLastMsg = true;               \
-        }                                      \
-    }
+// Writes msg to std::cerr (level <= LT_WARN) or std::cout (level == LT_INFO) when bit 1 of errIndex is set.
+// The do { } while (0) wrapper makes the expansion a single statement, so it is safe in an unbraced if / else.
+// msg is intentionally not parenthesized: callers pass stream chains such as "text" << value.
+#define LTRACE(level, errIndex, msg)           \
+    do                                         \
+    {                                          \
+        if ((errIndex) & 2)                    \
+        {                                      \
+            if ((level) <= LT_WARN)            \
+                std::cerr << msg << std::endl; \
+            else if ((level) == LT_INFO)       \
+                std::cout << msg << std::endl; \
+            if ((level) <= LT_INFO)            \
+                sLastMsg = true;               \
+        }                                      \
+    } while (0)
 class Process
 {
 public: