Skip to content

Commit

Permalink
Update mineru (#306)
Browse files Browse the repository at this point in the history
* update mineru

* update mineru

* update mineru

* fix markdown test case

* fix gpu test cases

* fix gpu test cases

* update model_dir_name
  • Loading branch information
Ceceliachenen authored Dec 12, 2024
1 parent 7cf5af2 commit 408df90
Show file tree
Hide file tree
Showing 16 changed files with 2,405 additions and 1,963 deletions.
17 changes: 14 additions & 3 deletions magic-pdf.gpu.template.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,21 @@
"bucket-name-1": ["ak", "sk", "endpoint"],
"bucket-name-2": ["ak", "sk", "endpoint"]
},
"models-dir": "/tmp/models",
"models-dir": "model_repository/PDF-Extract-Kit/models",
"layoutreader-model-dir": "model_repository/PDF-Extract-Kit/models/layoutreader",
"device-mode": "cuda",
"layout-config": {
"model": "doclayout_yolo"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"is_table_recog_enable": false,
"model": "rapid_table",
"enable": false,
"max_time": 400
}
},
"config_version": "1.0.0"
}
17 changes: 14 additions & 3 deletions magic-pdf.template.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,21 @@
"bucket-name-1": ["ak", "sk", "endpoint"],
"bucket-name-2": ["ak", "sk", "endpoint"]
},
"models-dir": "/tmp/models",
"models-dir": "model_repository/PDF-Extract-Kit/models",
"layoutreader-model-dir": "model_repository/PDF-Extract-Kit/models/layoutreader",
"device-mode": "cpu",
"layout-config": {
"model": "doclayout_yolo"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"is_table_recog_enable": false,
"model": "rapid_table",
"enable": false,
"max_time": 400
}
},
"config_version": "1.0.0"
}
4,162 changes: 2,264 additions & 1,898 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ torchvision = [
{version = "0.18.0+cpu", source = "pytorch_cpu", markers = "sys_platform != 'darwin'"},
{version = "0.17.2", markers = "sys_platform == 'darwin'"}
]
transformers = "4.40.0"
transformers = "4.42.4"
openpyxl = "^3.1.2"
pdf2image = "^1.17.0"
llama-index-storage-chat-store-redis = "^0.1.3"
Expand Down Expand Up @@ -96,7 +96,7 @@ detectron2 = [
{markers = "sys_platform == 'win32'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.3.0cpu-cp311-cp311-win_amd64.whl"},
{markers = "sys_platform != 'win32' and sys_platform != 'linux' ", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.2.2cpu-cp311-cp311-macosx_10_9_universal2.whl"}
]
magic-pdf = {version = "0.7.0b1", extras = ["full"]}
magic-pdf = {version = "0.10.5", extras = ["full"]}
peft = "^0.12.0"
duckduckgo-search = "6.2.12"
aliyun-bootstrap = "1.0.2"
Expand Down
4 changes: 2 additions & 2 deletions pyproject_gpu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ asgi-correlation-id = "^4.3.1"
openinference-instrumentation-llama-index = "^2.2.1"
torch = "2.2.2"
torchvision = "0.17.2"
transformers = "4.40.0"
transformers = "4.42.4"
openpyxl = "^3.1.2"
pdf2image = "^1.17.0"
llama-index-storage-chat-store-redis = "^0.1.3"
Expand Down Expand Up @@ -90,7 +90,7 @@ detectron2 = [
{markers = "sys_platform == 'win32'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.3.0cu121-cp311-cp311-win_amd64.whl"},
{markers = "sys_platform != 'win32' and sys_platform != 'linux' ", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.2.2cpu-cp311-cp311-macosx_10_9_universal2.whl"}
]
magic-pdf = {version = "0.7.0b1", extras = ["full"]}
magic-pdf = {version = "0.10.5", extras = ["full"]}
paddlepaddle-gpu = [
{markers = "sys_platform == 'linux'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/paddlepaddle_gpu-3.0.0b1-cp311-cp311-linux_x86_64.whl"},
{markers = "sys_platform != 'linux'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/paddlepaddle_gpu-3.0.0b1-cp311-cp311-win_amd64.whl"}
Expand Down
12 changes: 4 additions & 8 deletions src/pai_rag/integrations/readers/pai_pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Dict, List, Optional, Union, Any
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from pai_rag.utils.markdown_utils import (
transform_local_to_oss,
is_horizontal_table,
Expand Down Expand Up @@ -292,7 +292,7 @@ def parse_pdf(
model_json = []

# Run the parsing steps
image_writer = DiskReaderWriter(temp_file_path)
image_writer = FileBasedDataWriter(temp_file_path)

# Choose the parsing method
if parse_method == "auto":
Expand All @@ -308,12 +308,8 @@ def parse_pdf(
pipe.pipe_classify()

# If no model data was passed in, analyze with the built-in model
if not model_json:
if model_config.__use_inside_model__:
pipe.pipe_analyze() # 解析
else:
logger.error("need model list input")
exit(1)
if len(model_json) == 0:
pipe.pipe_analyze() # 解析

# Run the parse
pipe.pipe_parse()
Expand Down
2 changes: 1 addition & 1 deletion src/pai_rag/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
DEFAULT_BREAKPOINT = 95
DEFAULT_BUFFER_SIZE = 1

EAS_DEFAULT_MODEL_DIR = "/huggingface/pai_rag_model_repository"
EAS_DEFAULT_MODEL_DIR = "/huggingface/pai_rag_model_repository_01"
if not os.path.exists(EAS_DEFAULT_MODEL_DIR):
DEFAULT_MODEL_DIR = "./model_repository"
else:
Expand Down
14 changes: 6 additions & 8 deletions src/pai_rag/utils/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,17 @@ def load_mineru_config(self):
source_path = "magic-pdf.template.json"
destination_path = os.path.expanduser("~/magic-pdf.json") # 目标路径

if os.path.exists(destination_path):
logger.info(
"magic-pdf.json already exists, skip modifying ~/magic-pdf.json."
)
return

# Read the contents of the source_path file
with open(source_path, "r") as source_file:
data = json.load(source_file) # 加载 JSON 数据

if "models-dir" in data:
data["models-dir"] = (
str(self.download_directory_path) + "/PDF-Extract-Kit/models"
data["models-dir"] = os.path.join(
str(self.download_directory_path), "PDF-Extract-Kit/models"
)
if "layoutreader-model-dir" in data:
data["layoutreader-model-dir"] = os.path.join(
str(self.download_directory_path), "PDF-Extract-Kit/models/layoutreader"
)

# Write the modified contents to destination_path
Expand Down
30 changes: 23 additions & 7 deletions tests/core/test_rag_application.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
import asyncio
import os
from pathlib import Path
from pai_rag.app.api.models import RagQuery
import pytest
import shutil
from pai_rag.core.rag_application import RagApplication, RagChatType
from pai_rag.core.rag_config_manager import RagConfigManager

BASE_DIR = Path(__file__).parent.parent.parent
TEST_INDEX_PATH = "localdata/teststorage"

EXPECTED_EMPTY_RESPONSE = """Empty query. Please input your question."""


pytestmark = pytest.mark.skipif(
os.getenv("SKIP_GPU_TESTS", "false") == "true",
reason="Need to execute in a CUDA environment.",
)


@pytest.fixture(scope="module", autouse=True)
def rag_app():
from pai_rag.core.rag_application import RagApplication
from pai_rag.core.rag_config_manager import RagConfigManager

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()

Expand All @@ -31,7 +37,10 @@ def rag_app():


# Test rag query
def test_query(rag_app: RagApplication):
def test_query(rag_app):
from pai_rag.app.api.models import RagQuery
from pai_rag.core.rag_application import RagChatType

query = RagQuery(question="Why did he decide to learn AI?")
response = asyncio.run(rag_app.aquery(query, chat_type=RagChatType.RAG))
assert len(response.answer) > 10 and response.answer != "Empty Response"
Expand All @@ -42,7 +51,10 @@ def test_query(rag_app: RagApplication):


# Test llm query
def test_llm(rag_app: RagApplication):
def test_llm(rag_app):
from pai_rag.app.api.models import RagQuery
from pai_rag.core.rag_application import RagChatType

query = RagQuery(question="What is the result of 15+22?")
response = asyncio.run(rag_app.aquery(query, chat_type=RagChatType.LLM))
assert "37" in response.answer
Expand All @@ -53,7 +65,9 @@ def test_llm(rag_app: RagApplication):


# Test retrieval query
def test_retrieval(rag_app: RagApplication):
def test_retrieval(rag_app):
from pai_rag.app.api.models import RagQuery

retrieval_query = RagQuery(question="Why did he decide to learn AI?")
response = asyncio.run(rag_app.aretrieve(retrieval_query))
assert len(response.docs) > 0
Expand All @@ -64,7 +78,9 @@ def test_retrieval(rag_app: RagApplication):


# Test agent query
def test_agent(rag_app: RagApplication):
def test_agent(rag_app):
from pai_rag.app.api.models import RagQuery

query = RagQuery(question="What is the result of 15+22?")
response = asyncio.run(rag_app.aquery_agent(query))
assert "37" in response.answer
Expand Down
22 changes: 18 additions & 4 deletions tests/data_readers/test_csv_reader.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
import os
from pathlib import Path
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_csv_reader import PaiCSVReader, PaiPandasCSVReader
import pytest

BASE_DIR = Path(__file__).parent.parent.parent


pytestmark = pytest.mark.skipif(
os.getenv("SKIP_GPU_TESTS", "false") == "true",
reason="Need to execute in a CUDA environment.",
)


@pytest.fixture(scope="module", autouse=True)
def test_csv_reader():
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_csv_reader import PaiCSVReader

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()
directory_reader = resolve(
Expand All @@ -28,6 +37,11 @@ def test_csv_reader():


def test_pandas_csv_reader():
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_csv_reader import PaiPandasCSVReader

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()
directory_reader = resolve(
Expand Down
14 changes: 10 additions & 4 deletions tests/data_readers/test_excel_reader.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import os
from pathlib import Path
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_excel_reader import PaiPandasExcelReader
import pytest

BASE_DIR = Path(__file__).parent.parent.parent


@pytest.mark.skipif(
os.getenv("SKIP_GPU_TESTS", "false") == "true",
reason="Need to execute in a CUDA environment.",
)
def test_pandas_excel_reader():
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_excel_reader import PaiPandasExcelReader

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()
directory_reader = resolve(
Expand Down
14 changes: 10 additions & 4 deletions tests/data_readers/test_html_reader.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import os
from pathlib import Path
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_html_reader import PaiHtmlReader
import pytest

BASE_DIR = Path(__file__).parent.parent.parent


@pytest.mark.skipif(
os.getenv("SKIP_GPU_TESTS", "false") == "true",
reason="Need to execute in a CUDA environment.",
)
def test_pai_html_reader():
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_html_reader import PaiHtmlReader

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()
directory_reader = resolve(
Expand Down
14 changes: 10 additions & 4 deletions tests/data_readers/test_jsonl_reader.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import os
from pathlib import Path
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_jsonl_reader import PaiJsonLReader
import pytest

BASE_DIR = Path(__file__).parent.parent.parent


@pytest.mark.skipif(
os.getenv("SKIP_GPU_TESTS", "false") == "true",
reason="Need to execute in a CUDA environment.",
)
def test_jsonl_reader():
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_jsonl_reader import PaiJsonLReader

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()
directory_reader = resolve(
Expand Down
19 changes: 11 additions & 8 deletions tests/data_readers/test_pdf_reader.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
import os
from pathlib import Path

import pytest
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_pdf_reader import PaiPDFReader
from pai_rag.utils.download_models import ModelScopeDownloader
from pai_rag.utils.markdown_utils import is_horizontal_table

BASE_DIR = Path(__file__).parent.parent.parent


@pytest.mark.skipif(
pytestmark = pytest.mark.skipif(
os.getenv("SKIP_GPU_TESTS", "false") == "true",
reason="Need to execute in a CUDA environment.",
)


def test_pai_pdf_reader():
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_pdf_reader import PaiPDFReader
from pai_rag.utils.download_models import ModelScopeDownloader

config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
config = RagConfigManager.from_file(config_file).get_value()
directory_reader = resolve(
Expand All @@ -33,6 +34,8 @@ def test_pai_pdf_reader():


def test_is_horizontal_table():
from pai_rag.utils.markdown_utils import is_horizontal_table

# example data
horizontal_table_1 = [
["Name", "Age", "City"],
Expand Down
Loading

0 comments on commit 408df90

Please sign in to comment.