WIP: Add "Retrieve Metadata" for PDF

zotero · Jan 6, 2025 · 870cfd8 · 870cfd8
1 parent 0a5c364
commit 870cfd8
Show file tree

Hide file tree

Showing 10 changed files with 415 additions and 191 deletions.
diff --git a/scripts/server.cjs b/scripts/server.cjs
@@ -7,6 +7,7 @@ const serveStatic = require('serve-static');
 const checkTrue = env => !!(env && (parseInt(env) || env === true || env === "true"));
 
 const translateURL = process.env.TRANSLATE_URL ?? 'http://localhost:1969';
+const recognizerURL = process.env.RECOGNIZER_URL ?? 'http://localhost:1970';
 const useHTTPS = checkTrue(process.env.USE_HTTPS);
 const htmlFile = checkTrue(process.env.EMBEDDED) ? 'embedded.html' : 'index.html';
 const port = process.env.PORT ?? (useHTTPS ? 8443 : 8001);
@@ -15,7 +16,7 @@ const serve = serveStatic(path.join(__dirname, '..', 'build'), { 'index': false
 const proxy = httpProxy.createProxyServer();
 
 const handler = (req, resp) => {
-	const fallback = () => {
+const fallback = () => {
 		fs.readFile(path.join(__dirname, '..', 'build', htmlFile), (err, buf) => {
 			resp.setHeader('Content-Type', 'text/html');
 			resp.end(buf);
@@ -32,6 +33,17 @@ const handler = (req, resp) => {
 			resp.statusMessage = `Translation Server not available at ${translateURL}: ${err}`;
 			resp.end();
 		});
+	} else if(req.url.startsWith('/recognize')) {
+		proxy.web(req, resp, {
+			changeOrigin: true,
+			target: `${recognizerURL}`,
+			secure: false
+		});
+		proxy.on('error', err => {
+			resp.statusCode = 502;
+			resp.statusMessage = `Recognizer Server not available at ${recognizerURL}: ${err}`;
+			resp.end();
+		});
 	} else {
 		serve(req, resp, fallback);
 	}

diff --git a/src/js/actions/current.js b/src/js/actions/current.js
@@ -3,7 +3,7 @@ import { omit } from 'web-common/utils';
 import { getApiForItems, splitItemAndCollectionKeys } from '../common/actions';
 import { exportItems, chunkedToggleTagsOnItems, chunkedAddToCollection, chunkedCopyToLibrary,
 	chunkedDeleteItems, chunkedMoveItemsToTrash, chunkedRecoverItemsFromTrash,
-	chunkedRemoveFromCollection, chunkedUpdateCollectionsTrash, chunkedDeleteCollections, createItem, createItemOfType, toggleModal, navigate } from '.';
+	chunkedRemoveFromCollection, chunkedUpdateCollectionsTrash, chunkedDeleteCollections, createItem, createItemOfType, toggleModal, navigate, retrieveMetadata } from '.';
 import columnProperties from '../constants/column-properties';
 import { BIBLIOGRAPHY, COLLECTION_SELECT, EXPORT, NEW_ITEM } from '../constants/modals';
 import { TOGGLE_ADD, TOGGLE_REMOVE } from '../common/tags';
@@ -257,6 +257,17 @@ const currentGoToSubscribeUrl = () => {
 	}
 }
 
+/**
+ * Run "Retrieve Metadata" for every item key in the current selection.
+ * Keys are filtered through `splitItemAndCollectionKeys` first; only its
+ * `itemKeys` result is processed.
+ *
+ * @returns {Function} thunk resolving to an array with one entry per dispatched item
+ */
+const currentRetrieveMetadata = () => async (dispatch, getState) => {
+	const state = getState();
+	const { libraryKey, itemKeys: selectedKeys } = state.current;
+	const { itemKeys } = splitItemAndCollectionKeys(selectedKeys, libraryKey, state);
+	const pending = [];
+
+	// dispatch every lookup before awaiting any of them
+	for (const itemKey of itemKeys) {
+		pending.push(dispatch(retrieveMetadata(itemKey, libraryKey)));
+	}
+
+	return await Promise.all(pending);
+}
+
 export {
 	currentAddTags,
 	currentAddToCollection,
@@ -274,6 +285,7 @@ export {
 	currentRecoverFromTrash,
 	currentRemoveColoredTags,
 	currentRemoveItemFromCollection,
+	currentRetrieveMetadata,
 	currentToggleTagByIndex,
 	currentMoveToTrash,
 	currentTrashOrDelete,

diff --git a/src/js/actions/identifier.js b/src/js/actions/identifier.js
@@ -7,7 +7,7 @@ import { BEGIN_SEARCH_MULTIPLE_IDENTIFIERS, COMPLETE_SEARCH_MULTIPLE_IDENTIFIERS
 	RECEIVE_IDENTIFIER_MORE, ERROR_IDENTIFIER_MORE } from '../constants/actions';
 import { createItem, createItems, navigate } from '.';
 import { extractIdentifiers } from '../common/identifiers';
-import { EMPTY, SINGLE, CHOICE , CHOICE_EXHAUSTED, MULTIPLE } from '../constants/identifier-result-types';
+import { EMPTY, CHOICE , CHOICE_EXHAUSTED, MULTIPLE } from '../constants/identifier-result-types';
 
 const getNextLinkFromResponse = response => {
 	let next = null;
@@ -102,33 +102,16 @@ const searchIdentifier = (identifier, { shouldImport = false } = {}) => {
 					const message = 'Zotero could not find any identifiers in your input. Please verify your input and try again.';
 					dispatch({ type: RECEIVE_ADD_BY_IDENTIFIER, identifier, identifierIsUrl, result: EMPTY, message, import: shouldImport });
 				} else {
-					if(json.length > 0) {
-						dispatch({
-							type: RECEIVE_ADD_BY_IDENTIFIER,
-							result: MULTIPLE,
-							items: json,
-							identifierIsUrl,
-							identifier,
-							import: shouldImport,
-							response
-						});
-						return json;
-					} else {
-						const item = json[0];
-						delete item.key;
-						delete item.version;
-
-						dispatch({
-							type: RECEIVE_ADD_BY_IDENTIFIER,
-							result: SINGLE,
-							item,
-							identifier,
-							identifierIsUrl,
-							import: shouldImport,
-							response
-						});
-						return item;
-					}
+					dispatch({
+						type: RECEIVE_ADD_BY_IDENTIFIER,
+						result: MULTIPLE,
+						items: json,
+						identifierIsUrl,
+						identifier,
+						import: shouldImport,
+						response
+					});
+					return json;
 				}
 			}
 		} catch(error) {
@@ -174,7 +157,7 @@ const currentAddMultipleTranslatedItems = identifiers => {
 				const promises = identifiers.map(identifier => fetch(url, {
 					method: 'post',
 					mode: 'cors',
-					headers: { 'content-type': 'text/plain' },
+					headers: { 'Content-Type': 'text/plain' },
 					body: identifier
 				}).then(async r => (await r.json())[0]).catch(() => null));
 
@@ -266,7 +249,7 @@ const searchIdentifierMore = () => {
 			const response = await fetch(next, {
 				method: 'post',
 				mode: 'cors',
-				headers: { 'content-type': 'text/plain' },
+				headers: { 'Content-Type': 'text/plain' },
 				body: identifier
 			});
 

diff --git a/src/js/actions/index.js b/src/js/actions/index.js
@@ -13,6 +13,7 @@ export * from "./library";
 export * from "./meta";
 export * from "./navigate";
 export * from "./preferences";
+export * from './recognize';
 export * from "./styles";
 export * from "./tags";
 export * from "./triggers";

diff --git a/src/js/actions/items-write.js b/src/js/actions/items-write.js
@@ -354,9 +354,11 @@ const queueDeleteItems = (itemKeys, libraryKey, id, resolve, reject) => {
 	};
 }
 
-const updateItem = (itemKey, patch) => {
+const updateItem = (itemKey, patch, libraryKey) => {
 	return async (dispatch, getState) => {
-		const { libraryKey } = getState().current;
+		if (libraryKey === undefined) {
+			libraryKey = getState().current.libraryKey;
+		}
 		const id = requestTracker.id++;
 
 		dispatch({

diff --git a/src/js/actions/recognize.js b/src/js/actions/recognize.js
@@ -0,0 +1,165 @@
+import { createItem, getAttachmentUrl, updateItem } from '.';
+import { PDFWorker } from '../common/pdf-worker.js';
+import { pick } from 'web-common/utils';
+
+// TODO: This and `searchIdentifier` should be merged (but we don't want to dispatch identifier actions here)
+/**
+ * Look up a single identifier via the `/search` endpoint of the server
+ * configured as `translateUrl`.
+ *
+ * @param {string} identifier - sent verbatim as the plain-text request body
+ * @returns {Function} thunk resolving to the first entry of the JSON response
+ *   (`undefined` when there is none); rejects when the response status is not OK
+ */
+const getItemFromIdentifier = identifier => async (dispatch, getState) => {
+	const { translateUrl } = getState().config;
+	const response = await fetch(`${translateUrl}/search`, {
+		method: 'POST',
+		mode: 'cors',
+		headers: { 'Content-Type': 'text/plain' },
+		body: identifier
+	});
+
+	if (!response.ok) {
+		//TODO
+		throw new Error('Failed to get item from identifier');
+	}
+
+	const items = await response.json();
+	return items?.[0];
+}
+
+/**
+ * Recognize a PDF attachment, create a new item from the recognized metadata
+ * and re-parent the attachment under that item.
+ *
+ * @param {string} itemKey - key of the attachment item
+ * @param {string} [libraryKey] - library holding the attachment; defaults to the current library
+ * @returns {Function} thunk resolving to `undefined` once the attachment has been re-parented;
+ *   rejects when the attachment is unknown or no metadata could be recognized
+ */
+const retrieveMetadata = (itemKey, libraryKey) => {
+	return async (dispatch, getState) => {
+		const state = getState();
+		// Use the library the caller asked for; only fall back to the current one
+		const targetLibraryKey = libraryKey ?? state.current.libraryKey;
+		const attachmentItem = state.libraries[targetLibraryKey]?.items?.[itemKey];
+		if (!attachmentItem) {
+			throw new Error(`Attachment ${itemKey} not found in library ${targetLibraryKey}`);
+		}
+
+		const recognizerData = await dispatch(getRecognizerData(itemKey));
+		if (!recognizerData) {
+			// nothing to recognize from: fail with a clear message instead of a TypeError further down
+			throw new Error('Failed to extract recognizer data from attachment');
+		}
+
+		const recognizedItem = await dispatch(recognizePDF(recognizerData));
+		if (!recognizedItem) {
+			// recognizePDF resolves to null when it found neither an identifier nor a title
+			throw new Error('Failed to recognize document');
+		}
+
+		// Work on a copy; key/version are dropped so that a new item is created
+		const newItem = {
+			...recognizedItem,
+			collections: [...(attachmentItem.collections ?? [])],
+		};
+		delete newItem.key;
+		delete newItem.version;
+
+		const item = await dispatch(createItem(newItem, targetLibraryKey));
+		await dispatch(updateItem(itemKey, { parentItem: item.key }, targetLibraryKey));
+	}
+}
+
+
+/**
+ * Extract recognizer input data from a PDF attachment and send it to the
+ * recognizer server.
+ *
+ * @param {string} itemKey - key of the attachment item in the current library
+ * @returns {Function} thunk resolving to the recognizer server's JSON response;
+ *   rejects (after logging) when the attachment is missing, is not a PDF,
+ *   contains no text, or one of the requests fails
+ */
+const getRecognizerData = itemKey => {
+	return async (dispatch, getState) => {
+		try {
+			const state = getState();
+			const attachmentItem = state.libraries[state.current.libraryKey]?.items?.[itemKey];
+			if (!attachmentItem) {
+				throw new Error(`Attachment ${itemKey} not found`);
+			}
+			if (attachmentItem.contentType !== 'application/pdf') {
+				//TODO
+				throw new Error("Attachment is not a PDF");
+			}
+
+			const attachmentURL = await dispatch(getAttachmentUrl(itemKey));
+			const attachmentResponse = await fetch(attachmentURL);
+			if (!attachmentResponse.ok) {
+				// fetch() resolves on HTTP errors too; an error body must not be parsed as a PDF
+				throw new Error('Failed to download attachment');
+			}
+			const data = await attachmentResponse.arrayBuffer();
+
+			const { pdfWorkerURL, pdfReaderCMapsURL, pdfReaderStandardFontsURL, recognizerUrl } = state.config;
+			const pdfWorker = new PDFWorker({ pdfWorkerURL, pdfReaderCMapsURL, pdfReaderStandardFontsURL });
+			// TODO: add support for password-protected PDFs
+			const recognizerInputData = await pdfWorker.getRecognizerData(data);
+			recognizerInputData.fileName = attachmentItem.filename;
+
+			// a page counts as containing text when its entry at index 2 is non-empty
+			const hasText = recognizerInputData.pages.some(page => page?.[2]?.length);
+			if (!hasText) {
+				// TODO
+				throw new Error('PDF does not contain any text');
+			}
+
+			const response = await fetch(`${recognizerUrl}/recognize`, {
+				method: 'POST',
+				mode: 'cors',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify(recognizerInputData)
+			});
+			if (!response.ok) {
+				//TODO
+				throw new Error('Failed to recognize document');
+			}
+			return await response.json();
+		} catch (error) {
+			console.error(error);
+			// propagate the failure so the caller does not carry on with `undefined`
+			throw error;
+		}
+	}
+}
+
+
+/**
+ * Build item data from the recognizer server's response.
+ * Based on https://github.com/zotero/zotero/blob/5fd94e22dff87318aa3a84e735e1fdece488f5e3/chrome/content/zotero/xpcom/recognizeDocument.js#L411
+ *
+ * If the response carries an identifier (arXiv, DOI or ISBN, checked in that
+ * order) it is resolved through `getItemFromIdentifier`. When that yields
+ * nothing, an item is assembled from the recognized title and related fields.
+ *
+ * @param {Object} recognizerData - response of the recognizer server
+ * @returns {Function} thunk resolving to item data, or `null` when nothing usable was recognized
+ */
+const recognizePDF = (recognizerData) => {
+	return async (dispatch) => {
+		if (!recognizerData) {
+			return null;
+		}
+
+		// first candidate with a value wins
+		const [identifierPrefix, identifierValue] = [
+			['arxiv', recognizerData.arxiv],
+			['DOI', recognizerData.doi],
+			['ISBN', recognizerData.isbn],
+		].find(([, value]) => value) ?? [];
+
+		if (identifierValue) {
+			try {
+				const translatedItem = await dispatch(getItemFromIdentifier(`${identifierPrefix}: ${identifierValue}`));
+				if (translatedItem) {
+					// fill gaps in the translated item with what the recognizer found
+					if (!translatedItem.abstractNote && recognizerData.abstract) {
+						translatedItem.abstractNote = recognizerData.abstract;
+					}
+					if (!translatedItem.language && recognizerData.language) {
+						translatedItem.language = recognizerData.language;
+					}
+					if (translatedItem.tags) {
+						// every tag gets type 1 (presumably Zotero's "automatic" tag type; confirm against API docs)
+						translatedItem.tags = translatedItem.tags.map(tag => (
+							typeof tag === 'string' ? { tag, type: 1 } : { ...tag, type: 1 }
+						));
+					}
+					return translatedItem;
+				}
+			} catch (e) {
+				//TODO
+				// best effort: fall through to the title-based item below
+				console.error('RecognizeDocument: ' + e);
+			}
+		}
+
+		// no identifier found (or it could not be resolved)
+		if (!recognizerData.title) {
+			return null;
+		}
+
+		const itemType = recognizerData.type === 'book-chapter' ? 'bookSection' : 'journalArticle';
+		const typeSpecificFields = itemType === 'journalArticle'
+			? { issue: recognizerData.issue, ISSN: recognizerData.ISSN, publicationTitle: recognizerData.container }
+			: { bookTitle: recognizerData.container, publisher: recognizerData.publisher };
+
+		// Same shape as the translated item above: Zotero item JSON uses `itemType` and `ISSN`
+		return {
+			itemType,
+			// the recognizer may omit authors entirely
+			creators: (recognizerData.authors ?? []).map(author => ({
+				creatorType: 'author', ...pick(author, ['firstName', 'lastName'])
+			})),
+			title: recognizerData.title,
+			abstractNote: recognizerData.abstract,
+			date: recognizerData.year,
+			libraryCatalog: 'Zotero',
+			...pick(recognizerData, ['pages', 'volume', 'url', 'language']),
+			...typeSpecificFields,
+		};
+	}
+}
+
+export { retrieveMetadata };
diff --git a/src/js/common/pdf-worker.js b/src/js/common/pdf-worker.js
@@ -216,4 +216,32 @@ export class PDFWorker {
 			return modifiedBuf;
 		}, isPriority);
 	}
+
+	/**
+	 * Ask the worker for the data that recognizer-server expects.
+	 * The buffer is passed in the transfer list of the query.
+	 *
+	 * @param {ArrayBuffer} buf PDF file
+	 * @param {Boolean} [isPriority] Forwarded to the internal queue
+	 * @param {String} [password] Forwarded to the worker together with the buffer
+	 * @returns {Promise} Result of the worker's 'getRecognizerData' query
+	 */
+	async getRecognizerData(buf, isPriority, password) {
+		const task = async () => {
+			try {
+				return await this._query('getRecognizerData', { buf, password }, [buf]);
+			}
+			catch (queryError) {
+				const details = JSON.stringify({ error: queryError.message });
+				const error = new Error(`Worker 'getRecognizerData' failed: ${details}`);
+				// the worker's message may be JSON carrying an error name; keep it if it parses
+				try {
+					error.name = JSON.parse(queryError.message).name;
+				}
+				catch (parseError) {
+					console.log(parseError);
+				}
+				console.log(error);
+				throw error;
+			}
+		};
+		return this._enqueue(task, isPriority);
+	}
 }