Skip to content

Commit

Permalink
WIP: Add "Retrieve Metadata" for PDF
Browse files Browse the repository at this point in the history
  • Loading branch information
tnajdek committed Jan 6, 2025
1 parent 0a5c364 commit 870cfd8
Show file tree
Hide file tree
Showing 10 changed files with 415 additions and 191 deletions.
14 changes: 13 additions & 1 deletion scripts/server.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const serveStatic = require('serve-static');
const checkTrue = env => !!(env && (parseInt(env) || env === true || env === "true"));

const translateURL = process.env.TRANSLATE_URL ?? 'http://localhost:1969';
const recognizerURL = process.env.RECOGNIZER_URL ?? 'http://localhost:1970';
const useHTTPS = checkTrue(process.env.USE_HTTPS);
const htmlFile = checkTrue(process.env.EMBEDDED) ? 'embedded.html' : 'index.html';
const port = process.env.PORT ?? (useHTTPS ? 8443 : 8001);
Expand All @@ -15,7 +16,7 @@ const serve = serveStatic(path.join(__dirname, '..', 'build'), { 'index': false
const proxy = httpProxy.createProxyServer();

const handler = (req, resp) => {
const fallback = () => {
const fallback = () => {
fs.readFile(path.join(__dirname, '..', 'build', htmlFile), (err, buf) => {
resp.setHeader('Content-Type', 'text/html');
resp.end(buf);
Expand All @@ -32,6 +33,17 @@ const handler = (req, resp) => {
resp.statusMessage = `Translation Server not available at ${translateURL}: ${err}`;
resp.end();
});
} else if(req.url.startsWith('/recognize')) {
proxy.web(req, resp, {
changeOrigin: true,
target: `${recognizerURL}`,
secure: false
});
proxy.on('error', err => {
resp.statusCode = 502;
resp.statusMessage = `Recognizer Server not available at ${recognizerURL}: ${err}`;
resp.end();
});
} else {
serve(req, resp, fallback);
}
Expand Down
14 changes: 13 additions & 1 deletion src/js/actions/current.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { omit } from 'web-common/utils';
import { getApiForItems, splitItemAndCollectionKeys } from '../common/actions';
import { exportItems, chunkedToggleTagsOnItems, chunkedAddToCollection, chunkedCopyToLibrary,
chunkedDeleteItems, chunkedMoveItemsToTrash, chunkedRecoverItemsFromTrash,
chunkedRemoveFromCollection, chunkedUpdateCollectionsTrash, chunkedDeleteCollections, createItem, createItemOfType, toggleModal, navigate } from '.';
chunkedRemoveFromCollection, chunkedUpdateCollectionsTrash, chunkedDeleteCollections, createItem, createItemOfType, toggleModal, navigate, retrieveMetadata } from '.';
import columnProperties from '../constants/column-properties';
import { BIBLIOGRAPHY, COLLECTION_SELECT, EXPORT, NEW_ITEM } from '../constants/modals';
import { TOGGLE_ADD, TOGGLE_REMOVE } from '../common/tags';
Expand Down Expand Up @@ -257,6 +257,17 @@ const currentGoToSubscribeUrl = () => {
}
}

/**
 * Thunk creator: runs `retrieveMetadata` for every item in the current selection.
 *
 * The selected keys are read from `state.current.itemKeys` and filtered through
 * `splitItemAndCollectionKeys`, so only its `itemKeys` result is processed.
 *
 * @returns {Function} thunk resolving to an array with one entry per item key
 *   (whatever each dispatched `retrieveMetadata` resolves to). Because the
 *   results are combined with `Promise.all`, the returned promise rejects as
 *   soon as any single dispatch rejects.
 */
const currentRetrieveMetadata = () => async (dispatch, getState) => {
	const state = getState();
	const { current } = state;
	const selection = splitItemAndCollectionKeys(current.itemKeys, current.libraryKey, state);

	// All dispatches are started up front; they are not awaited one by one.
	const pending = [];
	for (const key of selection.itemKeys) {
		pending.push(dispatch(retrieveMetadata(key, current.libraryKey)));
	}
	return await Promise.all(pending);
};

export {
currentAddTags,
currentAddToCollection,
Expand All @@ -274,6 +285,7 @@ export {
currentRecoverFromTrash,
currentRemoveColoredTags,
currentRemoveItemFromCollection,
currentRetrieveMetadata,
currentToggleTagByIndex,
currentMoveToTrash,
currentTrashOrDelete,
Expand Down
43 changes: 13 additions & 30 deletions src/js/actions/identifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { BEGIN_SEARCH_MULTIPLE_IDENTIFIERS, COMPLETE_SEARCH_MULTIPLE_IDENTIFIERS
RECEIVE_IDENTIFIER_MORE, ERROR_IDENTIFIER_MORE } from '../constants/actions';
import { createItem, createItems, navigate } from '.';
import { extractIdentifiers } from '../common/identifiers';
import { EMPTY, SINGLE, CHOICE , CHOICE_EXHAUSTED, MULTIPLE } from '../constants/identifier-result-types';
import { EMPTY, CHOICE , CHOICE_EXHAUSTED, MULTIPLE } from '../constants/identifier-result-types';

const getNextLinkFromResponse = response => {
let next = null;
Expand Down Expand Up @@ -102,33 +102,16 @@ const searchIdentifier = (identifier, { shouldImport = false } = {}) => {
const message = 'Zotero could not find any identifiers in your input. Please verify your input and try again.';
dispatch({ type: RECEIVE_ADD_BY_IDENTIFIER, identifier, identifierIsUrl, result: EMPTY, message, import: shouldImport });
} else {
if(json.length > 0) {
dispatch({
type: RECEIVE_ADD_BY_IDENTIFIER,
result: MULTIPLE,
items: json,
identifierIsUrl,
identifier,
import: shouldImport,
response
});
return json;
} else {
const item = json[0];
delete item.key;
delete item.version;

dispatch({
type: RECEIVE_ADD_BY_IDENTIFIER,
result: SINGLE,
item,
identifier,
identifierIsUrl,
import: shouldImport,
response
});
return item;
}
dispatch({
type: RECEIVE_ADD_BY_IDENTIFIER,
result: MULTIPLE,
items: json,
identifierIsUrl,
identifier,
import: shouldImport,
response
});
return json;
}
}
} catch(error) {
Expand Down Expand Up @@ -174,7 +157,7 @@ const currentAddMultipleTranslatedItems = identifiers => {
const promises = identifiers.map(identifier => fetch(url, {
method: 'post',
mode: 'cors',
headers: { 'content-type': 'text/plain' },
headers: { 'Content-Type': 'text/plain' },
body: identifier
}).then(async r => (await r.json())[0]).catch(() => null));

Expand Down Expand Up @@ -266,7 +249,7 @@ const searchIdentifierMore = () => {
const response = await fetch(next, {
method: 'post',
mode: 'cors',
headers: { 'content-type': 'text/plain' },
headers: { 'Content-Type': 'text/plain' },
body: identifier
});

Expand Down
1 change: 1 addition & 0 deletions src/js/actions/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export * from "./library";
export * from "./meta";
export * from "./navigate";
export * from "./preferences";
export * from './recognize';
export * from "./styles";
export * from "./tags";
export * from "./triggers";
Expand Down
6 changes: 4 additions & 2 deletions src/js/actions/items-write.js
Original file line number Diff line number Diff line change
Expand Up @@ -354,9 +354,11 @@ const queueDeleteItems = (itemKeys, libraryKey, id, resolve, reject) => {
};
}

const updateItem = (itemKey, patch) => {
const updateItem = (itemKey, patch, libraryKey) => {
return async (dispatch, getState) => {
const { libraryKey } = getState().current;
if (libraryKey === undefined) {
libraryKey = getState().current.libraryKey;
}
const id = requestTracker.id++;

dispatch({
Expand Down
165 changes: 165 additions & 0 deletions src/js/actions/recognize.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import { createItem, getAttachmentUrl, updateItem } from '.';
import { PDFWorker } from '../common/pdf-worker.js';
import { pick } from 'web-common/utils';

// TODO: This and `searchIdentifier` should be merged (but we don't want to dispatch identifier actions here)
/**
 * Thunk creator: looks up an identifier on the translation server.
 *
 * POSTs the identifier as plain text to `${state.config.translateUrl}/search`
 * and resolves to the first element of the JSON response (`undefined` when the
 * response is nullish or has no first element).
 *
 * @param {string} identifier - sent verbatim as the request body
 * @returns {Function} thunk resolving to the first translated item
 * @throws {Error} when the server answers with a non-OK status
 */
const getItemFromIdentifier = identifier => async (dispatch, getState) => {
	const { translateUrl } = getState().config;
	const requestOptions = {
		method: 'POST',
		mode: 'cors',
		headers: { 'Content-Type': 'text/plain' },
		body: identifier
	};
	const response = await fetch(`${translateUrl}/search`, requestOptions);

	if (!response.ok) {
		//TODO
		throw new Error('Failed to get item from identifier');
	}

	const translatedItems = await response.json();
	return translatedItems?.[0];
};

/**
 * Thunk creator: recognizes a PDF attachment, creates a parent item from the
 * recognized metadata and re-parents the attachment under it.
 *
 * Steps: fetch recognizer data for the attachment, turn it into an item via
 * `recognizePDF`, create that item (in the same collections as the
 * attachment), then set the attachment's `parentItem` to the new item's key.
 *
 * @param {string} itemKey - key of the attachment item
 * @param {string} [libraryKey] - library holding the attachment; falls back to
 *   `state.current.libraryKey` when omitted
 * @returns {Function} thunk resolving to `undefined` once the attachment has
 *   been updated
 * @throws {Error} when the attachment is not in the store, or when nothing
 *   could be recognized
 */
const retrieveMetadata = (itemKey, libraryKey) => {
	return async (dispatch, getState) => {
		const state = getState();
		// Look the attachment up in the library the caller asked for. Reading
		// state.current.libraryKey unconditionally breaks when the two differ.
		const targetLibraryKey = libraryKey ?? state.current.libraryKey;
		const attachmentItem = state.libraries[targetLibraryKey]?.items?.[itemKey];
		if (!attachmentItem) {
			throw new Error(`Attachment ${itemKey} not found in library ${targetLibraryKey}`);
		}

		// NOTE(review): getRecognizerData only receives the item key and reads
		// the current library itself; confirm that is acceptable when
		// libraryKey is not the current library.
		const recognizerData = await dispatch(getRecognizerData(itemKey));
		if (!recognizerData) {
			// getRecognizerData logs its own failure and resolves to undefined
			throw new Error(`No metadata could be recognized for attachment ${itemKey}`);
		}

		const recognizedItem = await dispatch(recognizePDF(recognizerData));
		if (!recognizedItem) {
			// recognizePDF resolves to null when there is neither an identifier nor a title
			throw new Error(`No metadata could be recognized for attachment ${itemKey}`);
		}

		// Work on a copy so the recognized object is not mutated; key/version
		// must not be carried over to the item that is about to be created.
		const parentItemData = {
			...recognizedItem,
			collections: [...(attachmentItem.collections ?? [])]
		};
		delete parentItemData.key;
		delete parentItemData.version;

		const item = await dispatch(createItem(parentItemData, targetLibraryKey));
		await dispatch(updateItem(itemKey, { parentItem: item.key }, targetLibraryKey));
	}
}


/**
 * Thunk creator: extracts recognizer input from a PDF attachment and sends it
 * to the recognizer server.
 *
 * The attachment is looked up in the current library
 * (`state.current.libraryKey`), downloaded via the URL obtained from
 * `getAttachmentUrl`, processed by `PDFWorker#getRecognizerData`, and the
 * result (plus `fileName`) is POSTed as JSON to
 * `${state.config.recognizerUrl}/recognize`.
 *
 * Any error raised along the way is logged with `console.error` and NOT
 * rethrown, so the thunk resolves to `undefined` on failure.
 *
 * @param {string} itemKey - key of the attachment item
 * @returns {Function} thunk resolving to the recognizer's JSON response, or
 *   `undefined` when anything failed
 */
const getRecognizerData = itemKey => async (dispatch, getState) => {
	try {
		const state = getState();
		const library = state.libraries[state.current.libraryKey];
		const attachmentItem = library?.items?.[itemKey];
		const isPDF = attachmentItem.contentType === 'application/pdf';
		if (!isPDF) {
			//TODO
			throw new Error("Attachment is not a PDF");
		}

		const attachmentURL = await dispatch(getAttachmentUrl(itemKey));
		const pdfResponse = await fetch(attachmentURL);
		const pdfBytes = await pdfResponse.arrayBuffer();

		const { pdfWorkerURL, pdfReaderCMapsURL, pdfReaderStandardFontsURL, recognizerUrl } = state.config;
		const worker = new PDFWorker({ pdfWorkerURL, pdfReaderCMapsURL, pdfReaderStandardFontsURL });
		// TODO: add support for password-protected PDFs
		const payload = await worker.getRecognizerData(pdfBytes);
		payload.fileName = attachmentItem.filename;

		// A page counts as containing text when its third entry is non-empty.
		const hasAnyText = payload.pages.some(page => Boolean(page?.[2]?.length));
		if (!hasAnyText) {
			// TODO
			throw new Error('PDF does not contain any text');
		}

		const recognizerResponse = await fetch(`${recognizerUrl}/recognize`, {
			method: 'POST',
			mode: 'cors',
			headers: { 'content-type': 'application/json' },
			body: JSON.stringify(payload)
		});
		if (!recognizerResponse.ok) {
			//TODO
			throw new Error('Failed to recognize document');
		}
		// awaited here so that a JSON parsing failure is also caught below
		return await recognizerResponse.json();
	} catch (error) {
		console.error(error);
	}
};


// create item based on data returned from recognizer
// based on https://github.com/zotero/zotero/blob/5fd94e22dff87318aa3a84e735e1fdece488f5e3/chrome/content/zotero/xpcom/recognizeDocument.js#L411
/**
 * Thunk creator: turns recognizer output into item data.
 *
 * 1. If the recognizer found an arXiv ID, DOI or ISBN (checked in that order),
 *    the identifier is resolved through `getItemFromIdentifier`. A found item
 *    is returned after filling in a missing abstract/language from the
 *    recognizer data and forcing every tag to `type: 1`.
 * 2. Otherwise, or when the lookup fails or finds nothing, an item is
 *    assembled from the recognizer data itself, provided it has a title.
 *
 * @param {Object|null|undefined} recognizerData - response of the recognizer
 *   server; nullish input is treated as "nothing recognized"
 * @returns {Function} thunk resolving to the item data, or `null` when nothing
 *   usable was recognized
 */
const recognizePDF = (recognizerData) => {
	return async (dispatch) => {
		if (!recognizerData) {
			// e.g. getRecognizerData failed and resolved to undefined
			return null;
		}

		let identifierPrefix = '';
		let identifierValue = '';
		if (recognizerData.arxiv) {
			identifierPrefix = 'arxiv';
			identifierValue = recognizerData.arxiv;
		} else if (recognizerData.doi) {
			identifierPrefix = 'DOI';
			identifierValue = recognizerData.doi;
		} else if (recognizerData.isbn) {
			identifierPrefix = 'ISBN';
			identifierValue = recognizerData.isbn;
		}

		if (identifierPrefix && identifierValue) {
			try {
				const translatedItem = await dispatch(getItemFromIdentifier(`${identifierPrefix}: ${identifierValue}`));
				if (translatedItem) {
					if (!translatedItem.abstractNote && recognizerData.abstract) {
						translatedItem.abstractNote = recognizerData.abstract;
					}
					if (!translatedItem.language && recognizerData.language) {
						translatedItem.language = recognizerData.language;
					}
					if (translatedItem.tags) {
						// tags may come back as plain strings or as objects; normalise both to type 1
						translatedItem.tags = translatedItem.tags.map(tag => {
							if (typeof tag === 'string') {
								return { tag, type: 1 };
							}
							tag.type = 1;
							return tag;
						});
					}
					return translatedItem;
				}
			} catch (e) {
				// Deliberate best-effort: a failed lookup falls through to the
				// title-based item below.
				//TODO
				console.error('RecognizeDocument: ' + e);
			}
		}

		// no identifier found
		if (recognizerData.title) {
			let type = 'journalArticle';

			if (recognizerData.type === 'book-chapter') {
				type = 'bookSection';
			}

			// The recognizer may return a title without any authors; calling
			// .map() on a missing list used to throw here.
			const authors = Array.isArray(recognizerData.authors) ? recognizerData.authors : [];

			// NOTE(review): Zotero item JSON normally names these fields
			// `itemType` and `ISSN`; confirm that `createItem` accepts
			// `type` / `issn` as produced here.
			const newItem = {
				type,
				creators: authors.map(author => ({
					creatorType: 'author', ...pick(author, ['firstName', 'lastName'])
				})),
				title: recognizerData.title,
				abstractNote: recognizerData.abstract,
				date: recognizerData.year,
				libraryCatalog: 'Zotero',
				...pick(recognizerData, ['pages', 'volume', 'url', 'language']),
				...(type === 'journalArticle' ? { issue: recognizerData.issue, issn: recognizerData.ISSN, publicationTitle: recognizerData.container } : {}),
				...(type === 'bookSection' ? { bookTitle: recognizerData.container, publisher: recognizerData.publisher } : {}),
			};

			return newItem;
		}

		return null;
	}
}

export { retrieveMetadata };
28 changes: 28 additions & 0 deletions src/js/common/pdf-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -216,4 +216,32 @@ export class PDFWorker {
return modifiedBuf;
}, isPriority);
}

/**
* Get data for recognizer-server
*
* @param {ArrayBuffer} buf PDF file
* @param {Boolean} [isPriority]
* @param {String} [password]
* @returns {Promise}
*/
async getRecognizerData(buf, isPriority, password) {
return this._enqueue(async () => {
try {
var result = await this._query('getRecognizerData', { buf, password }, [buf]);
}
catch (e) {
let error = new Error(`Worker 'getRecognizerData' failed: ${JSON.stringify({ error: e.message })}`);
try {
error.name = JSON.parse(e.message).name;
}
catch (e) {
console.log(e);
}
console.log(error);
throw error;
}
return result;
}, isPriority);
}
}
Loading

0 comments on commit 870cfd8

Please sign in to comment.