Skip to content

Commit

Permalink
Merge pull request #3435 from omnivore-app/fix/readability
Browse files Browse the repository at this point in the history
fix/readability
  • Loading branch information
sywhb authored Jan 25, 2024
2 parents 7567e00 + fd7c2ff commit 7f6b722
Show file tree
Hide file tree
Showing 15 changed files with 26,355 additions and 263 deletions.
2 changes: 1 addition & 1 deletion packages/api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,4 @@
"volta": {
"extends": "../../package.json"
}
}
}
15 changes: 5 additions & 10 deletions packages/api/src/jobs/save_page.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { Readability } from '@omnivore/readability'
import axios from 'axios'
import jwt from 'jsonwebtoken'
import { promisify } from 'util'
Expand Down Expand Up @@ -63,7 +62,6 @@ interface FetchResult {
title?: string
content?: string
contentType?: string
readabilityResult?: Readability.ParseResult
}

const isFetchResult = (obj: unknown): obj is FetchResult => {
Expand Down Expand Up @@ -297,7 +295,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {

// get the fetch result from cache
const fetchedResult = await getCachedFetchResult(url)
const { title, contentType, readabilityResult } = fetchedResult
const { title, contentType } = fetchedResult
let content = fetchedResult.content

// for pdf content, we need to upload the pdf
Expand Down Expand Up @@ -350,7 +348,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
state: state ? (state as ArticleSavingRequestStatus) : undefined,
labels: labels,
rssFeedUrl,
Expand All @@ -362,13 +359,11 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
user
)

// if (result.__typename == 'SaveError') {
// logger.error('Error saving page', { userId, url, result })
// throw new Error('Error saving page')
// }
if (result.__typename == 'SaveError') {
throw new Error(result.message || result.errorCodes[0])
}

// if the readability result is not parsed, the import is failed
isImported = !!readabilityResult
isImported = true
isSaved = true
} catch (e) {
if (e instanceof Error) {
Expand Down
1 change: 1 addition & 0 deletions packages/api/src/readability.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ declare module '@omnivore/readability' {
/** Article published date */
publishedDate?: Date | null
language?: string | null
documentElement: HTMLElement
}
}

Expand Down
1 change: 0 additions & 1 deletion packages/api/src/services/save_email.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ export const saveEmail = async (
// can leave this empty for now
},
},
null,
true
)

Expand Down
30 changes: 15 additions & 15 deletions packages/api/src/services/save_page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,13 @@ export const savePage = async (
input: SavePageInput,
user: User
): Promise<SaveResult> => {
const parseResult = await parsePreparedContent(
input.url,
{
document: input.originalContent,
pageInfo: {
title: input.title,
canonicalUrl: input.url,
},
const parseResult = await parsePreparedContent(input.url, {
document: input.originalContent,
pageInfo: {
title: input.title,
canonicalUrl: input.url,
},
input.parseResult
)
})
const [newSlug, croppedPathname] = createSlug(input.url, input.title)
let slug = newSlug
let clientRequestId = input.clientRequestId
Expand Down Expand Up @@ -116,6 +112,7 @@ export const savePage = async (
})
} catch (e) {
return {
__typename: 'SaveError',
errorCodes: [SaveErrorCode.Unknown],
message: 'Failed to create page save request',
}
Expand Down Expand Up @@ -187,11 +184,14 @@ export const savePage = async (
libraryItem: { id: clientRequestId },
}

if (!(await createHighlight(highlight, clientRequestId, user.id))) {
return {
errorCodes: [SaveErrorCode.EmbeddedHighlightFailed],
message: 'Failed to save highlight',
}
try {
await createHighlight(highlight, clientRequestId, user.id)
} catch (error) {
logger.error('Failed to create highlight', {
highlight,
clientRequestId,
userId: user.id,
})
}
}

Expand Down
216 changes: 108 additions & 108 deletions packages/api/src/utils/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ const getReadabilityResult = async (
export const parsePreparedContent = async (
url: string,
preparedDocument: PreparedDocumentInput,
parseResult?: Readability.ParseResult | null,
isNewsletter?: boolean,
allowRetry = true
): Promise<ParsedContentPuppeteer> => {
Expand All @@ -232,12 +231,8 @@ export const parsePreparedContent = async (
labels: { source: 'parsePreparedContent' },
}

// If we have a parse result, use it
let article = parseResult || null
let highlightData = undefined
const { document, pageInfo } = preparedDocument

if (!document) {
const { document: domContent, pageInfo } = preparedDocument
if (!domContent) {
logger.info('No document')
return {
canonicalUrl: url,
Expand All @@ -257,142 +252,147 @@ export const parsePreparedContent = async (
return {
canonicalUrl: url,
parsedContent: null,
domContent: document,
domContent,
pageType: PageType.Unknown,
}
}

let dom: Document | null = null
const { title: pageInfoTitle, canonicalUrl } = pageInfo

let parsedContent: Readability.ParseResult | null = null
let pageType = PageType.Unknown
let highlightData = undefined

try {
dom = parseHTML(document).document
const document = parseHTML(domContent).document
pageType = parseOriginalContent(document)

if (!article) {
// Attempt to parse the article
// preParse content
dom = (await preParseContent(url, dom)) || dom
// Run readability
await preParseContent(url, document)

article = await getReadabilityResult(url, document, dom, isNewsletter)
}
parsedContent = await getReadabilityResult(
url,
domContent,
document,
isNewsletter
)

if (!parsedContent || !parsedContent.content) {
logger.info('No parsed content')

if (allowRetry) {
logger.info('Retrying with content wrapped in html body')

const newDocument = {
...preparedDocument,
document: '<html><body>' + domContent + '</body></html>', // wrap in body
}
return parsePreparedContent(url, newDocument, isNewsletter, false)
}

if (!article?.textContent && allowRetry) {
const newDocument = {
...preparedDocument,
document: '<html><body>' + document + '</body></html>', // wrap in body
return {
canonicalUrl,
parsedContent,
domContent,
pageType,
}
return parsePreparedContent(
url,
newDocument,
parseResult,
isNewsletter,
false
)
}

// use title if not found after running readability
if (!parsedContent.title && pageInfoTitle) {
parsedContent.title = pageInfoTitle
}

const newDocumentElement = parsedContent.documentElement
// Format code blocks
// TODO: we probably want to move this type of thing
// to the handlers, and have some concept of postHandle
if (article?.content) {
const articleDom = parseHTML(article.content).document
const codeBlocks = articleDom.querySelectorAll(
'code, pre[class^="prism-"], pre[class^="language-"]'
)
if (codeBlocks.length > 0) {
codeBlocks.forEach((e) => {
if (e.textContent) {
const att = hljs.highlightAuto(e.textContent)
const code = articleDom.createElement('code')
const langClass =
`hljs language-${att.language}` +
(att.second_best?.language
? ` language-${att.second_best?.language}`
: '')
code.setAttribute('class', langClass)
code.innerHTML = att.value
e.replaceWith(code)
}
})
article.content = articleDom.documentElement.outerHTML
const codeBlocks = newDocumentElement.querySelectorAll(
'pre[class^="prism-"], pre[class^="language-"], code'
)
codeBlocks.forEach((e) => {
if (!e.textContent) {
return e.parentNode?.removeChild(e)
}

highlightData = findEmbeddedHighlight(articleDom.documentElement)

const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]

// Get the top level element?
const pageNode = articleDom.firstElementChild as HTMLElement
const nodesToVisitStack: [HTMLElement] = [pageNode]
const visitedNodeList = []

while (nodesToVisitStack.length > 0) {
const currentNode = nodesToVisitStack.pop()
if (
currentNode?.nodeType !== 1 ||
// Avoiding dynamic elements from being counted as anchor-allowed elements
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
currentNode.hasAttribute(attrib)
)
) {
continue
}
visitedNodeList.push(currentNode)
;[].slice
.call(currentNode.childNodes)
.reverse()
.forEach(function (node) {
nodesToVisitStack.push(node)
})
}
// replace <br> or <p> or </p> with \n
e.innerHTML = e.innerHTML.replace(/<(br|p|\/p)>/g, '\n')

const att = hljs.highlightAuto(e.textContent)
const code = document.createElement('code')
const langClass =
`hljs language-${att.language}` +
(att.second_best?.language
? ` language-${att.second_best?.language}`
: '')
code.setAttribute('class', langClass)
code.innerHTML = att.value
e.replaceWith(code)
})

highlightData = findEmbeddedHighlight(newDocumentElement)

visitedNodeList.shift()
visitedNodeList.forEach((node, index) => {
// start from index 1, index 0 reserved for anchor unknown.
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]

article.content = articleDom.documentElement.outerHTML
// Get the top level element?
// const pageNode = newDocumentElement.firstElementChild as HTMLElement
const nodesToVisitStack: [HTMLElement] = [newDocumentElement]
const visitedNodeList = []

while (nodesToVisitStack.length > 0) {
const currentNode = nodesToVisitStack.pop()
if (
currentNode?.nodeType !== 1 ||
// Avoiding dynamic elements from being counted as anchor-allowed elements
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
currentNode.hasAttribute(attrib)
)
) {
continue
}
visitedNodeList.push(currentNode)
;[].slice
.call(currentNode.childNodes)
.reverse()
.forEach(function (node) {
nodesToVisitStack.push(node)
})
}

visitedNodeList.shift()
visitedNodeList.forEach((node, index) => {
// start from index 1, index 0 reserved for anchor unknown.
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})

const newHtml = newDocumentElement.outerHTML
const newWindow = parseHTML('')
const DOMPurify = createDOMPurify(newWindow)
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)

Object.assign(article || {}, {
content: clean,
title: article?.title,
previewImage: article?.previewImage,
siteName: article?.siteName,
siteIcon: article?.siteIcon,
byline: article?.byline,
language: article?.language,
})
const cleanHtml = DOMPurify.sanitize(newHtml, DOM_PURIFY_CONFIG)
parsedContent.content = cleanHtml

logRecord.parseSuccess = true
} catch (error) {
logger.info('Error parsing content', error)
logger.error('Error parsing content', error)

Object.assign(logRecord, {
parseSuccess: false,
parseError: error,
})
}

const { title, canonicalUrl } = pageInfo

Object.assign(article || {}, {
title: article?.title || title,
})

logger.info('parse-article completed')
logger.info('parse-article completed', logRecord)

return {
domContent: document,
parsedContent: article,
canonicalUrl,
pageType: dom ? parseOriginalContent(dom) : PageType.Unknown,
parsedContent,
domContent,
pageType,
highlightData,
}
}
Expand Down
Loading

1 comment on commit 7f6b722

@vercel
Copy link

@vercel vercel bot commented on 7f6b722 Jan 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.