From 96edf57d3d3eb2563cf2d7496ea4aa95da809944 Mon Sep 17 00:00:00 2001 From: ranxia Date: Mon, 20 Jan 2025 17:56:02 +0800 Subject: [PATCH 1/2] add image mimetypes --- .../index/pai/multimodal/multimodal_retriever.py | 12 ++++++++++-- .../nodeparsers/pai/pai_markdown_parser.py | 3 +++ .../integrations/nodeparsers/pai/pai_node_parser.py | 1 - src/pai_rag/integrations/readers/pai_image_reader.py | 6 +++++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/pai_rag/integrations/index/pai/multimodal/multimodal_retriever.py b/src/pai_rag/integrations/index/pai/multimodal/multimodal_retriever.py index 6a9b4c37..1d6134b8 100644 --- a/src/pai_rag/integrations/index/pai/multimodal/multimodal_retriever.py +++ b/src/pai_rag/integrations/index/pai/multimodal/multimodal_retriever.py @@ -34,6 +34,7 @@ from pai_rag.integrations.index.pai.local.local_bm25_index import LocalBm25IndexStore from loguru import logger import llama_index.core.instrumentation as instrument +from pai_rag.utils.constants import DEFAULT_IMAGE_MIMETYPE dispatcher = instrument.get_dispatcher(__name__) @@ -200,7 +201,10 @@ def _retrieve( if image_url and image_url not in seen_image_urls: integrated_image_nodes.append( NodeWithScore( - node=ImageNode(image_url=image_url), + node=ImageNode( + image_url=image_url, + image_mimetype=DEFAULT_IMAGE_MIMETYPE, + ), score=node.score, ) ) @@ -447,6 +451,7 @@ def _build_node_list_from_query_result( node = ImageNode( id_=node.id_, image_url=node.metadata.get("image_url"), + image_mimetype=DEFAULT_IMAGE_MIMETYPE, metadata=node.metadata, ) query_result.nodes[i] = node @@ -513,7 +518,10 @@ async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]: if image_url and image_url not in seen_image_urls: integrated_image_nodes.append( NodeWithScore( - node=ImageNode(image_url=image_url), + node=ImageNode( + image_url=image_url, + image_mimetype=DEFAULT_IMAGE_MIMETYPE, + ), score=node.score, ) ) diff --git a/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py b/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py index be10978d..9462981b 100644 --- a/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py +++ b/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py @@ -16,6 +16,7 @@ build_markdown_tree, TreeNode, ) +from pai_rag.utils.constants import DEFAULT_IMAGE_MIMETYPE class ImageInfo(BaseModel): @@ -74,6 +75,7 @@ def _format_tree_nodes( image_node = ImageNode( embedding=doc_node.embedding, image_url=node.content, + image_mimetype=DEFAULT_IMAGE_MIMETYPE, excluded_embed_metadata_keys=doc_node.excluded_embed_metadata_keys, excluded_llm_metadata_keys=doc_node.excluded_llm_metadata_keys, metadata_separator=doc_node.metadata_separator, @@ -235,6 +237,7 @@ def traverse_tree( image_node = ImageNode( embedding=doc_node.embedding, image_url=child.content, + image_mimetype=DEFAULT_IMAGE_MIMETYPE, excluded_embed_metadata_keys=doc_node.excluded_embed_metadata_keys, excluded_llm_metadata_keys=doc_node.excluded_llm_metadata_keys, metadata_separator=doc_node.metadata_separator, diff --git a/src/pai_rag/integrations/nodeparsers/pai/pai_node_parser.py b/src/pai_rag/integrations/nodeparsers/pai/pai_node_parser.py index dd3abb67..d2cece77 100644 --- a/src/pai_rag/integrations/nodeparsers/pai/pai_node_parser.py +++ b/src/pai_rag/integrations/nodeparsers/pai/pai_node_parser.py @@ -158,7 +158,6 @@ def get_nodes_from_documents( text=doc_node.text, metadata=doc_node.metadata, image_url=doc_node.image_url, - image_path=doc_node.image_path, image_mimetype=doc_node.image_mimetype, ) ) diff --git a/src/pai_rag/integrations/readers/pai_image_reader.py b/src/pai_rag/integrations/readers/pai_image_reader.py index dec26308..12cc7ccc 100644 --- a/src/pai_rag/integrations/readers/pai_image_reader.py +++ b/src/pai_rag/integrations/readers/pai_image_reader.py @@ -46,11 +46,15 @@ def load_data( }, # set public read to make image accessible path_prefix="pairag/images/", ) + extension = file_path.suffix.replace(".", "") + mimetypes = f"image/{extension}" extra_info["file_path"] = str(file_path) extra_info["file_name"] = os.path.basename(file_path) extra_info["image_url"] = image_url - image_doc = ImageDocument(image_url=image_url, extra_info=extra_info) + image_doc = ImageDocument( + image_url=image_url, image_mimetype=mimetypes, extra_info=extra_info + ) docs = [image_doc] # docs = self.load_image_urls([image_url], extra_info=extra_info) return docs From 70b4b242cb545ad906eefade7da09cfaaf1f0c35 Mon Sep 17 00:00:00 2001 From: ranxia Date: Mon, 20 Jan 2025 19:18:53 +0800 Subject: [PATCH 2/2] fix image type --- src/pai_rag/utils/constants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pai_rag/utils/constants.py b/src/pai_rag/utils/constants.py index daf92f6f..1b316c46 100644 --- a/src/pai_rag/utils/constants.py +++ b/src/pai_rag/utils/constants.py @@ -23,3 +23,5 @@ DEFAULT_DATAFILE_DIR = "./data" DEFAULT_DASHSCOPE_EMBEDDING_MODEL = "text-embedding-v2" + +DEFAULT_IMAGE_MIMETYPE = "image/jpeg"