From 69e9fce1143b48043578a6f462b1c3f0a66d4a48 Mon Sep 17 00:00:00 2001
From: Benoit Pierre
Date: Thu, 31 Oct 2024 23:29:08 +0100
Subject: [PATCH] add markdown support using md4c

---
 crengine/include/bookformats.h |   3 +-
 crengine/src/lvdocview.cpp     |  32 ++++++
 crengine/src/mdfmt.cpp         | 185 +++++++++++++++++++++++++++++++++
 crengine/src/mdfmt.h           |  50 +++++++++
 4 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 crengine/src/mdfmt.cpp
 create mode 100644 crengine/src/mdfmt.h

diff --git a/crengine/include/bookformats.h b/crengine/include/bookformats.h
index a38a37d1f..2aa854d61 100644
--- a/crengine/include/bookformats.h
+++ b/crengine/include/bookformats.h
@@ -20,7 +20,8 @@ typedef enum {
     doc_format_pdb,
     doc_format_odt,
     doc_format_svg,
-    doc_format_max = doc_format_svg
+    doc_format_md,
+    doc_format_max = doc_format_md
     // don't forget update getDocFormatName() when changing this enum
 } doc_format_t;
 
diff --git a/crengine/src/lvdocview.cpp b/crengine/src/lvdocview.cpp
index d1a867709..30696e4aa 100644
--- a/crengine/src/lvdocview.cpp
+++ b/crengine/src/lvdocview.cpp
@@ -31,6 +31,7 @@
 #include "../include/fb3fmt.h"
 #include "../include/docxfmt.h"
 #include "../include/odtfmt.h"
+#include "mdfmt.h"
 
 /// to show page bounds rectangles
 //#define SHOW_PAGE_RECT
@@ -4649,6 +4650,37 @@ bool LVDocView::LoadDocument(LVStreamRef stream, bool metadataOnly) {
     }
 #endif
 
+#if (USE_MD4C == 1)
+    if (DetectMarkdownFormat(m_stream, stream->GetName())) {
+        CRLog::info("Markdown format detected");
+        createEmptyDocument();
+        m_doc->setProps(m_doc_props);
+        setRenderProps(0, 0);
+        setDocFormat(doc_format_md);
+        if (m_callback)
+            m_callback->OnLoadFileFormatDetected(doc_format_md);
+        updateDocStyleSheet();
+        bool res = ImportMarkdownDocument(m_stream, stream->GetName(), m_doc, m_callback, this);
+        if (!res) {
+            setDocFormat(doc_format_none);
+            createDefaultDocument(cs32("ERROR: Error reading Markdown format"), cs32("Cannot open document"));
+            if (m_callback) {
+                m_callback->OnLoadFileError(cs32("Error reading Markdown document"));
+            }
+            return false;
+        } else {
+            setRenderProps(0, 0);
+            REQUEST_RENDER("loadDocument")
+            if (m_callback) {
+                m_callback->OnLoadFileEnd();
+                //m_doc->compact();
+                m_doc->dumpStatistics();
+            }
+            return true;
+        }
+    }
+#endif
+
     bool repeat_recursively = false;
     m_arc = LVOpenArchieve( m_stream );
     if (!m_arc.isNull())
diff --git a/crengine/src/mdfmt.cpp b/crengine/src/mdfmt.cpp
new file mode 100644
index 000000000..0624dc438
--- /dev/null
+++ b/crengine/src/mdfmt.cpp
@@ -0,0 +1,185 @@
+/***************************************************************************
+ *   crengine-ng                                                           *
+ *   Copyright (C) 2022,2024 Aleksey Chernov                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or         *
+ *   modify it under the terms of the GNU General Public License           *
+ *   as published by the Free Software Foundation; either version 2        *
+ *   of the License, or (at your option) any later version.                *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the Free Software           *
+ *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,            *
+ *   MA 02110-1301, USA.                                                   *
+ ***************************************************************************/
+
+#include "mdfmt.h"
+
+#if (USE_MD4C == 1)
+
+// NOTE(review): the names inside the three angle-bracket includes below were
+// missing from the reviewed copy of this patch; they are reconstructed from
+// the symbols this file uses and must be confirmed against the source tree.
+#include <lvtinydom.h>
+
+#include <lvxml.h>
+
+#include <md4c-html.h>
+
+/// Size in bytes of the stack buffer used to read the source stream.
+#define TEXT_PARSER_CHUNK_SIZE 16384
+
+/// Tell whether a stream should be loaded as a Markdown document.
+///
+/// A file qualifies when its name ends with ".md" (compared in lower case),
+/// its size is at least 5 bytes and at most MARKDOWN_MAX_FILE_SIZE, and
+/// LVTextParser::CheckFormat() accepts its content. The stream is rewound
+/// to position 0 after the text check has run.
+///
+/// @param stream   stream to inspect
+/// @param fileName name of the file; only used for the extension check
+/// @return true when all three checks pass
+bool DetectMarkdownFormat(LVStreamRef stream, const lString32& fileName) {
+    // Check file extension
+    lString32 nm = fileName;
+    nm = nm.lowercase();
+    if (!nm.endsWith(".md"))
+        return false;
+    // Check file size
+    lvsize_t sz = stream->GetSize();
+    if (sz < 5 || sz > MARKDOWN_MAX_FILE_SIZE)
+        return false;
+    // Checking for compliance with the text format
+    LVTextParser textParser(stream, NULL, true);
+    bool res = textParser.CheckFormat();
+    stream->SetPos(0);
+    return res;
+}
+
+/// State handed to md4c through its user-data pointer.
+struct cre_md4c_parse_data
+{
+    lString8* htmlData; ///< buffer that receives the generated HTML
+};
+
+/// md4c output callback: append one chunk of generated HTML to the buffer.
+static void my_md4c_process_output(const MD_CHAR* chunk, MD_SIZE sz, void* userData) {
+    cre_md4c_parse_data* data = static_cast<cre_md4c_parse_data*>(userData);
+    data->htmlData->append(chunk, sz);
+}
+
+/// Replace the HTML metacharacters '&', '<' and '>' with entities so that
+/// arbitrary text can be embedded in generated markup.
+static lString8 escapeHtmlText(const lString8& text) {
+    lString8 escaped;
+    const char* src = text.c_str();
+    const int len = text.length();
+    for (int i = 0; i < len; i++) {
+        switch (src[i]) {
+            case '&':
+                escaped.append("&amp;", 5);
+                break;
+            case '<':
+                escaped.append("&lt;", 4);
+                break;
+            case '>':
+                escaped.append("&gt;", 4);
+                break;
+            default:
+                escaped.append(src + i, 1);
+                break;
+        }
+    }
+    return escaped;
+}
+
+/// Write a whole buffer to a stream; true only if every byte was written.
+static bool writeFully(LVStreamRef stream, const char* data, lvsize_t size) {
+    lvsize_t written = 0;
+    return LVERR_OK == stream->Write(data, size, &written) && written == size;
+}
+
+/// Load a Markdown document into doc.
+///
+/// If doc->openFromCache() succeeds nothing else is done. Otherwise the
+/// stream is read into memory, converted to HTML by md4c, wrapped in a small
+/// HTML skeleton and handed to LVHTMLParser through a memory stream.
+///
+/// @param stream           Markdown source; read from position 0
+/// @param fileName         file name; without its extension it becomes the title
+/// @param doc              document to fill
+/// @param progressCallback progress listener; checked for NULL before use here
+/// @param formatCallback   passed to ldomDocument::openFromCache()
+/// @return true if the document came from the cache or was parsed successfully
+bool ImportMarkdownDocument(LVStreamRef stream, const lString32& fileName, ldomDocument* doc, LVDocViewCallback* progressCallback, CacheLoadingCallback* formatCallback) {
+    if (doc->openFromCache(formatCallback)) {
+        if (progressCallback) {
+            progressCallback->OnLoadFileEnd();
+        }
+        return true;
+    }
+    // Read stream: keep reading until end-of-file so short reads are not lost
+    lString8 rawData;
+    char buffer[TEXT_PARSER_CHUNK_SIZE];
+    stream->SetPos(0);
+    for (;;) {
+        lvsize_t bytesRead = 0;
+        if (stream->Read(buffer, sizeof(buffer), &bytesRead) != LVERR_OK || bytesRead == 0)
+            break;
+        rawData.append(buffer, bytesRead);
+    }
+    // A read error ends the loop exactly like end-of-file does: refuse to
+    // import a silently truncated document.
+    if (static_cast<lvsize_t>(rawData.length()) < stream->GetSize()) {
+        CRLog::error("MD4C: Failed to read Markdown document!");
+        return false;
+    }
+    // Parse and convert to html
+    lString8 htmlData;
+    cre_md4c_parse_data parseData;
+    parseData.htmlData = &htmlData;
+    int parse_res = md_html(rawData.c_str(), rawData.length(), my_md4c_process_output, &parseData,
+                            MD_FLAG_COLLAPSEWHITESPACE | MD_FLAG_TABLES | MD_FLAG_TASKLISTS |
+                                    MD_FLAG_STRIKETHROUGH | MD_FLAG_PERMISSIVEURLAUTOLINKS |
+                                    MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS |
+                                    MD_FLAG_LATEXMATHSPANS,
+                            0);
+    rawData.clear();
+    if (0 != parse_res) {
+        // Parse failed
+        CRLog::error("MD4C: Failed to parse Markdown document!");
+        return false;
+    }
+    // Wrap the generated markup in an HTML skeleton and parse it from memory
+    lString32 title = LVExtractFilenameWithoutExtension(fileName);
+    // NOTE(review): the markup inside the two literals below was missing from
+    // the reviewed copy of this patch and is reconstructed; confirm it.
+    // The title comes from a file name, so escape it before embedding it.
+    lString8 gen_preamble = cs8("<html><head><title>") + escapeHtmlText(UnicodeToUtf8(title)) + cs8("</title></head><body>");
+    lString8 gen_tail = cs8("</body></html>");
+    LVStreamRef memStream = LVCreateMemoryStream();
+    bool res = !memStream.isNull() &&
+               writeFully(memStream, gen_preamble.c_str(), gen_preamble.length()) &&
+               writeFully(memStream, htmlData.c_str(), htmlData.length()) &&
+               writeFully(memStream, gen_tail.c_str(), gen_tail.length());
+    htmlData.clear();
+    if (res) {
+        // Parse stream to document
+        ldomDocumentWriter writer(doc);
+        LVHTMLParser parser(memStream, &writer);
+        parser.setProgressCallback(progressCallback);
+        res = parser.CheckFormat() && parser.Parse();
+    }
+    if (res) {
+        doc->getProps()->setString(DOC_PROP_TITLE, title);
+        doc->buildTocFromHeadings();
+    }
+    return res;
+}
+
+#endif
diff --git a/crengine/src/mdfmt.h b/crengine/src/mdfmt.h
new file mode 100644
index 000000000..cdc4bce7a
--- /dev/null
+++ b/crengine/src/mdfmt.h
@@ -0,0 +1,50 @@
+/***************************************************************************
+ *   crengine-ng                                                           *
+ *   Copyright (C) 2022,2024 Aleksey Chernov                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or         *
+ *   modify it under the terms of the GNU General Public License           *
+ *   as published by the Free Software Foundation; either version 2        *
+ *   of the License, or (at your option) any later version.                *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the Free Software           *
+ *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,            *
+ *   MA 02110-1301, USA.                                                   *
+ ***************************************************************************/
+
+#ifndef __MDFMT_H_INCLUDED__
+#define __MDFMT_H_INCLUDED__
+
+// NOTE(review): the names inside the angle-bracket includes of this header
+// were missing from the reviewed copy of this patch; they are reconstructed
+// from the types used below and must be confirmed against the source tree.
+#include <crsetup.h>
+
+#if (USE_CMARK_GFM == 1) || (USE_MD4C == 1)
+
+#include <lvstring.h>
+#include <lvstream.h>
+
+class ldomDocument;
+class LVDocViewCallback;
+class CacheLoadingCallback;
+
+/// Largest file size, in bytes, accepted by DetectMarkdownFormat().
+/// Parenthesised so the macro expands safely inside larger expressions.
+#define MARKDOWN_MAX_FILE_SIZE (10 * 1024 * 1024) // 10M
+
+/// Tell whether the named stream should be loaded as a Markdown document.
+bool DetectMarkdownFormat(LVStreamRef stream, const lString32& fileName);
+
+/// Load a Markdown document from the stream into doc; returns true on success.
+bool ImportMarkdownDocument(LVStreamRef stream, const lString32& fileName, ldomDocument* doc, LVDocViewCallback* progressCallback, CacheLoadingCallback* formatCallback);
+
+#endif // (USE_CMARK_GFM == 1) || (USE_MD4C == 1)
+
+#endif // __MDFMT_H_INCLUDED__