Skip to content

Commit

Permalink
add markdown support using md4c
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit-pierre committed Oct 31, 2024
1 parent f9e9c3f commit 69e9fce
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 1 deletion.
3 changes: 2 additions & 1 deletion crengine/include/bookformats.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ typedef enum {
doc_format_pdb,
doc_format_odt,
doc_format_svg,
doc_format_max = doc_format_svg
doc_format_md,
doc_format_max = doc_format_md
// don't forget to update getDocFormatName() when changing this enum
} doc_format_t;

Expand Down
32 changes: 32 additions & 0 deletions crengine/src/lvdocview.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "../include/fb3fmt.h"
#include "../include/docxfmt.h"
#include "../include/odtfmt.h"
#include "mdfmt.h"

/// to show page bounds rectangles
//#define SHOW_PAGE_RECT
Expand Down Expand Up @@ -4649,6 +4650,37 @@ bool LVDocView::LoadDocument(LVStreamRef stream, bool metadataOnly) {
}
#endif

#if (USE_MD4C == 1)
if (DetectMarkdownFormat(m_stream, stream->GetName())) {
CRLog::info("Markdown format detected");
createEmptyDocument();
m_doc->setProps(m_doc_props);
setRenderProps(0, 0);
setDocFormat(doc_format_md);
if (m_callback)
m_callback->OnLoadFileFormatDetected(doc_format_md);
updateDocStyleSheet();
bool res = ImportMarkdownDocument(m_stream, stream->GetName(), m_doc, m_callback, this);
if (!res) {
setDocFormat(doc_format_none);
createDefaultDocument(cs32("ERROR: Error reading Markdown format"), cs32("Cannot open document"));
if (m_callback) {
m_callback->OnLoadFileError(cs32("Error reading Markdown document"));
}
return false;
} else {
setRenderProps(0, 0);
REQUEST_RENDER("loadDocument")
if (m_callback) {
m_callback->OnLoadFileEnd();
//m_doc->compact();
m_doc->dumpStatistics();
}
return true;
}
}
#endif

bool repeat_recursively = false;
m_arc = LVOpenArchieve( m_stream );
if (!m_arc.isNull())
Expand Down
130 changes: 130 additions & 0 deletions crengine/src/mdfmt.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/***************************************************************************
* crengine-ng *
* Copyright (C) 2022,2024 Aleksey Chernov <[email protected]> *
* *
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, *
* MA 02110-1301, USA. *
***************************************************************************/

#include "mdfmt.h"

#if (USE_MD4C == 1)

#include <lvtinydom.h>

#include <string.h>

#include <md4c-html.h>

#define TEXT_PARSER_CHUNK_SIZE 16384

/**
 * Decides whether a stream should be handled as a Markdown document.
 *
 * Three checks are applied in order: the file name must end with ".md"
 * (compared after lowercasing), the stream size must lie within
 * [5, MARKDOWN_MAX_FILE_SIZE], and LVTextParser::CheckFormat() must accept
 * the content. The stream position is reset to 0 after the content check.
 *
 * @param stream   stream holding the candidate document
 * @param fileName name of the file, used only for the extension check
 * @return true when all three checks pass, false otherwise
 */
bool DetectMarkdownFormat(LVStreamRef stream, const lString32& fileName) {
    // 1. Extension check on a lowercased copy of the name
    lString32 loweredName = fileName;
    loweredName = loweredName.lowercase();
    const bool hasMarkdownExt = loweredName.endsWith(".md");
    if (!hasMarkdownExt)
        return false;

    // 2. Size check: reject files that are too small or too large
    const lvsize_t streamSize = stream->GetSize();
    const bool sizeAccepted = streamSize >= 5 && streamSize <= MARKDOWN_MAX_FILE_SIZE;
    if (!sizeAccepted)
        return false;

    // 3. Content check: the data must be recognized by the text parser
    LVTextParser formatProbe(stream, NULL, true);
    const bool looksLikeText = formatProbe.CheckFormat();
    // Rewind so the next consumer starts from the beginning of the stream
    stream->SetPos(0);
    return looksLikeText;
}

// User-data payload passed through md4c to the output callback.
struct cre_md4c_parse_data_tag
{
    // Buffer to which the output callback appends the generated HTML
    lString8* htmlData;
};
typedef struct cre_md4c_parse_data_tag cre_md4c_parse_data;

static void my_md4c_process_output(const MD_CHAR* chunk, MD_SIZE sz, void* userData) {
cre_md4c_parse_data* data = (cre_md4c_parse_data*)userData;
data->htmlData->append(chunk, sz);
}

bool ImportMarkdownDocument(LVStreamRef stream, const lString32& fileName, ldomDocument* doc, LVDocViewCallback* progressCallback, CacheLoadingCallback* formatCallback) {
if (doc->openFromCache(formatCallback)) {
if (progressCallback) {
progressCallback->OnLoadFileEnd();
}
return true;
}
bool res = false;
// Read stream
lString8 rawData;
lString8 htmlData;
char buffer[TEXT_PARSER_CHUNK_SIZE];
lvsize_t bytesRead = 0;
stream->SetPos(0);
while (stream->Read(buffer, TEXT_PARSER_CHUNK_SIZE, &bytesRead) == LVERR_OK) {
rawData.append(buffer, bytesRead);
if (bytesRead < TEXT_PARSER_CHUNK_SIZE)
break;
}
// Parse and convert to html
cre_md4c_parse_data parseData;
parseData.htmlData = &htmlData;
int parse_res = md_html(rawData.c_str(), rawData.length(), my_md4c_process_output, (void*)&parseData,
MD_FLAG_COLLAPSEWHITESPACE | MD_FLAG_TABLES | MD_FLAG_TASKLISTS |
MD_FLAG_STRIKETHROUGH | MD_FLAG_PERMISSIVEURLAUTOLINKS |
MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS |
MD_FLAG_LATEXMATHSPANS,
0);
rawData.clear();
if (0 != parse_res) {
// Parse failed
CRLog::error("MD4C: Failed to parse Markdown document!");
return res;
}
// Write document content to stream to parse them
lvsize_t result_len = htmlData.length();
lString32 title = LVExtractFilenameWithoutExtension(fileName);
lString8 gen_preamble = cs8("<html><head><title>") + UnicodeToUtf8(title) + cs8("</title></head><body>");
lString8 gen_tail = cs8("</body></html>");
lvsize_t dw;
LVStreamRef memStream = LVCreateMemoryStream();
res = !memStream.isNull();
if (res)
res = LVERR_OK == memStream->Write(gen_preamble.c_str(), gen_preamble.length(), &dw);
if (res)
res = dw == (lvsize_t)gen_preamble.length();
if (res) {
res = LVERR_OK == memStream->Write(htmlData.data(), result_len, &dw);
}
htmlData.clear();
if (res)
res = dw == result_len;
if (res)
res = LVERR_OK == memStream->Write(gen_tail.c_str(), gen_tail.length(), &dw);
if (res)
res = dw == (lvsize_t)gen_tail.length();
if (res) {
// Parse stream to document
ldomDocumentWriter writer(doc);
LVHTMLParser parser(memStream, &writer);
parser.setProgressCallback(progressCallback);
res = parser.CheckFormat() && parser.Parse();
}
if (res) {
doc->getProps()->setString(DOC_PROP_TITLE, title);
doc->buildTocFromHeadings();
}
return res;
}

#endif
42 changes: 42 additions & 0 deletions crengine/src/mdfmt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/***************************************************************************
* crengine-ng *
* Copyright (C) 2022,2024 Aleksey Chernov <[email protected]> *
* *
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, *
* MA 02110-1301, USA. *
***************************************************************************/

#ifndef __MDFMT_H_INCLUDED__
#define __MDFMT_H_INCLUDED__

#include <crsetup.h>

#if (USE_CMARK_GFM == 1) || (USE_MD4C == 1)

#include <lvstream.h>
#include <lvstring.h>

class ldomDocument;
class LVDocViewCallback;
class CacheLoadingCallback;

// Largest file size (in bytes) accepted as a Markdown document.
// The replacement list is parenthesized so the macro expands as a single
// value inside any surrounding expression (e.g. division or modulo).
#define MARKDOWN_MAX_FILE_SIZE (10 * 1024 * 1024) // 10M

bool DetectMarkdownFormat(LVStreamRef stream, const lString32& fileName);
bool ImportMarkdownDocument(LVStreamRef stream, const lString32& fileName, ldomDocument* doc, LVDocViewCallback* progressCallback, CacheLoadingCallback* formatCallback);

#endif // (USE_CMARK_GFM == 1) || (USE_MD4C == 1)

#endif // __MDFMT_H_INCLUDED__

0 comments on commit 69e9fce

Please sign in to comment.