Change the parsed result parquet file name to parsed_result.parquet when all_files is set to True

Marker-Inc-Korea · Dec 9, 2024 · 7867a19 · 7867a19
1 parent e5e2b8b
commit 7867a19
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 9 deletions.
diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
@@ -95,11 +95,11 @@ def run_parser(
 
 	# save results to parquet files
 	if all_files:
-		filepaths = list(
-			map(
-				lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules))
+		if len(module_params) > 1:
+			raise ValueError(
+				"All files is set to True, You can only use one parsing module."
 			)
-		)
+		filepaths = [os.path.join(project_dir, "parsed_result.parquet")]
 	else:
 		filepaths = list(
 			map(

diff --git a/docs/source/data_creation/parse/parse.md b/docs/source/data_creation/parse/parse.md
@@ -154,8 +154,11 @@ If the parsing is completed successfully, the following three types of files are
 
 #### Use all files
 
-For example, if parsing is performed using three parse methods, the following files are created.
-`0.parquet`, `1.parquet`, `2.parquet`, `parse_config.yaml`, `summary.csv`
+You can use only one parse method at a time.
+
+The parsed result will be saved as `parsed_result.parquet`.
+
+If you want to use two all_files parse methods, you should run the parse pipeline twice with two different YAML files.
 
 Finally, in the summary.csv file, you can see information about the parsed result, such as what parse method was used to parse it.
 

diff --git a/sample_config/parse/all_files_full.yaml b/sample_config/parse/all_files_full.yaml
@@ -1,10 +1,22 @@
+# You can use only one of the following modules at a time.
 modules:
+  # Use Directory Parse
   - module_type: langchain_parse
     file_type: all_files
-    parse_method: [ directory, unstructured, upstagedocumentparse ]
+    parse_method: directory
+  # Use Unstructured
+  - module_type: langchain_parse
+    file_type: all_files
+    parse_method: unstructured
+  # Use Upstage Document Parse
+  - module_type: langchain_parse
+    file_type: all_files
+    parse_method: upstagedocumentparse
+  # Use Naver Clova OCR
   - module_type: clova
     file_type: all_files
     table_detection: true
+  # Use Llama Parse
   - module_type: llamaparse
     file_type: all_files
     result_type: markdown

diff --git a/tests/autorag/test_parser.py b/tests/autorag/test_parser.py
@@ -157,8 +157,10 @@ def test_start_parsing_all_files(simple_parser):
 	project_dir = simple_parser.project_dir
 	assert os.path.exists(project_dir)
 	assert os.path.exists(os.path.join(project_dir, "parse_config.yaml"))
-	assert os.path.exists(os.path.join(project_dir, "0.parquet"))
-	all_files_result = pd.read_parquet(os.path.join(project_dir, "0.parquet"))
+	assert os.path.exists(os.path.join(project_dir, "parsed_result.parquet"))
+	all_files_result = pd.read_parquet(
+		os.path.join(project_dir, "parsed_result.parquet")
+	)
 
 	expect_result_columns = ["texts", "path", "page", "last_modified_datetime"]
 	assert all(