Skip to content

Commit

Permalink
Detect UTF-8 in SRT files
Browse files Browse the repository at this point in the history
Also, remove unused code in utf8Converter.cpp and remove "sfDefault" from
the SourceFormat enum: its meaning was platform-dependent and no longer
makes sense now that we default to UTF-8 on non-Windows platforms.
Processing falls back to the ACP on Windows to keep backwards compatibility.
  • Loading branch information
lighterowl committed Feb 8, 2020
1 parent 355151c commit 0035efb
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 50 deletions.
18 changes: 18 additions & 0 deletions tsMuxer/convertUTF.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,24 @@ Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd)
return isLegalUTF8(source, length);
}

/*
 * Verifies that a whole byte buffer consists of valid UTF-8 sequences only.
 *
 * The buffer is walked one encoded sequence at a time: the lead byte is looked up in
 * trailingBytesForUTF8 to obtain the expected sequence length, and the sequence is then
 * checked with isLegalUTF8.
 *
 * string - first byte of the buffer to examine.
 * length - number of bytes in the buffer.
 *
 * Returns true when every sequence is legal and the last one ends exactly at the end of
 * the buffer (a buffer with no bytes to examine yields true); returns false otherwise,
 * including when the final sequence is truncated.
 */
Boolean isLegalUTF8String(const UTF8* string, int length)
{
    const auto stringEnd = string + length;
    while (string < stringEnd)
    {
        const auto seqLength = trailingBytesForUTF8[*string] + 1;
        /* as the comment for the trailing bytes array notes, valid UTF-8 cannot contain 5- or 6-byte sequences. */
        if (seqLength >= 5)
        {
            return false;
        }
        /* compare against the number of bytes left rather than computing string + seqLength first :
           for a truncated trailing sequence that pointer would lie past one-past-the-end of the
           buffer, and merely forming it is undefined behaviour. */
        if (seqLength > stringEnd - string || !isLegalUTF8(string, seqLength))
        {
            return false;
        }
        string += seqLength;
    }
    return true;
}

/* --------------------------------------------------------------------- */

ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart,
Expand Down
2 changes: 2 additions & 0 deletions tsMuxer/convertUTF.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ extern "C"

Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);

Boolean isLegalUTF8String(const UTF8* string, int length);

#ifdef __cplusplus
}
#endif
Expand Down
6 changes: 4 additions & 2 deletions tsMuxer/srtStreamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <string>

#include "convertUTF.h"
#include "matroskaParser.h"
#include "memory.h"
#include "vodCoreException.h"
Expand Down Expand Up @@ -54,7 +55,7 @@ bool SRTStreamReader::detectSrcFormat(uint8_t* dataStart, int len, int& prefixLe
if (len < 4)
return false;
// detect UTF-8/UTF-16/UTF-32 format
if (dataStart[0] == 0xEF && dataStart[1] == 0xBB && dataStart[2] == 0xBF)
if ((dataStart[0] == 0xEF && dataStart[1] == 0xBB && dataStart[2] == 0xBF) || isLegalUTF8String(dataStart, len))
{
m_charSize = 1;
m_srcFormat = UtfConverter::sfUTF8;
Expand Down Expand Up @@ -97,9 +98,10 @@ bool SRTStreamReader::detectSrcFormat(uint8_t* dataStart, int len, int& prefixLe
else
{
#ifdef _WIN32
LTRACE(LT_INFO, 2, "Failed to auto-detect SRT encoding : falling back to the active code page");
m_srcFormat = UtfConverter::sfANSI; // default value for win32
#else
// m_srcFormat = UtfConverter::sfDefault;
LTRACE(LT_INFO, 2, "Failed to auto-detect SRT encoding : falling back to UTF-8");
m_srcFormat = UtfConverter::sfUTF8;
#endif
}
Expand Down
35 changes: 1 addition & 34 deletions tsMuxer/utf8Converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ std::wstring toWideString(uint8_t* start, size_t widesize, SourceFormat srcForma
{
if (widesize == 0)
return L"";
#if defined(_WIN32)
#ifdef _WIN32
else if (srcFormat == sfANSI)
{
wchar_t* widestringnative = new wchar_t[widesize + 1];
Expand All @@ -81,39 +81,6 @@ std::wstring toWideString(uint8_t* start, size_t widesize, SourceFormat srcForma
delete[] widestringnative;
return resultstring;
}
#elif __linux__ == 1
/*
else if (srcFormat == sfDefault)
{
if (cd == 0)
{
sourceEncoding = nl_langinfo(CODESET);
LTRACE(LT_INFO, 2, "Default text encoding: " << sourceEncoding.c_str());
if (sizeof(wchar_t) == 4)
cd = iconv_open("UTF-32", sourceEncoding.c_str());
else
cd = iconv_open("UTF-16", sourceEncoding.c_str());
if (cd == (iconv_t)-1)
THROW(ERR_COMMON, "Can't initialize iconv library for source encoding " <<
sourceEncoding);
}
wchar_t* widestringnative = new wchar_t[widesize+1];
memset(widestringnative, 0, sizeof(wchar_t) * (widesize+1));
size_t outLen = (widesize+1) * sizeof(wchar_t);
size_t __widesize = widesize;
char* src = (char*) start;
char* dst = (char*) widestringnative;
size_t wlen = iconv(cd, &src, &__widesize, &dst, &outLen);
if (wlen == -1)
THROW(ERR_COMMON, "Can't convert source string '" << start << "'to unicode");
widestringnative[widesize] = 0;
std::wstring resultstring(widestringnative);
delete [] widestringnative;
return resultstring;
}
*/
#endif
else if (sizeof(wchar_t) == 2)
{
Expand Down
3 changes: 1 addition & 2 deletions tsMuxer/utf8Converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ namespace UtfConverter
enum SourceFormat
{
sfUnknown,
sfANSI,
sfDefault,
sfANSI, // currently active code page (CP_ACP). used only on Windows.
sfUTF8,
sfUTF16le,
sfUTF16be,
Expand Down
27 changes: 15 additions & 12 deletions tsMuxer/vod_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,21 @@

#if 1
extern bool sLastMsg;
// LTRACE(level, errIndex, msg): when bit value 2 of errIndex is set, writes msg followed by
// std::endl to std::cerr for levels <= LT_WARN or to std::cout for level == LT_INFO, and sets
// sLastMsg for levels <= LT_INFO. msg is spliced into a stream insertion chain.
// This form expands to a bare compound statement, so "LTRACE(...);" leaves an extra empty
// statement behind, which does not compose with an unbraced if/else around the call.
#define LTRACE(level, errIndex, msg) \
{ \
if (errIndex & 2) \
{ \
if (level <= LT_WARN) \
std::cerr << msg << std::endl; \
else if (level == LT_INFO) \
std::cout << msg << std::endl; \
if (level <= LT_INFO) \
sLastMsg = true; \
} \
}
// LTRACE(level, errIndex, msg): conditional trace output.
//
// Nothing happens unless bit value 2 of errIndex is set. When it is:
//   - level <= LT_WARN  : msg and std::endl are written to std::cerr
//   - level == LT_INFO  : msg and std::endl are written to std::cout
//   - level <= LT_INFO  : sLastMsg is additionally set to true
//
// msg is spliced into a stream insertion chain, so it may itself contain "<<" and is
// deliberately left unparenthesized. level and errIndex are parenthesized so that
// expression arguments (e.g. "a | b") are evaluated as a whole before being compared or
// masked. level may be evaluated more than once, so it should be free of side effects.
//
// The do { ... } while (0) wrapper makes "LTRACE(...);" a single statement, which keeps it
// safe inside an unbraced if/else.
#define LTRACE(level, errIndex, msg)           \
    do                                         \
    {                                          \
        if ((errIndex) & 2)                    \
        {                                      \
            if ((level) <= LT_WARN)            \
                std::cerr << msg << std::endl; \
            else if ((level) == LT_INFO)       \
                std::cout << msg << std::endl; \
            if ((level) <= LT_INFO)            \
                sLastMsg = true;               \
        }                                      \
    } while (0)
class Process
{
public:
Expand Down

3 comments on commit 0035efb

@abakum
Copy link
Contributor

@abakum abakum commented on 0035efb Feb 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xavery You are the best!

@abakum
Copy link
Contributor

@abakum abakum commented on 0035efb Feb 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks to @xavery, the torment with writing .srt in UTF-8 with BOM ended.
image
Now you can write .srt in UTF-8 without BOM or even in 8-bit active code page encoding for example, in cp1251

@abakum
Copy link
Contributor

@abakum abakum commented on 0035efb Feb 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

Please sign in to comment.