Skip to content

Commit

Permalink
change parse_result parquet file name at all_files set True
Browse files Browse the repository at this point in the history
  • Loading branch information
bwook00 committed Dec 9, 2024
1 parent e5e2b8b commit 7867a19
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 9 deletions.
8 changes: 4 additions & 4 deletions autorag/data/parse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,11 @@ def run_parser(

# save results to parquet files
if all_files:
filepaths = list(
map(
lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules))
if len(module_params) > 1:
raise ValueError(
"All files is set to True, You can only use one parsing module."
)
)
filepaths = [os.path.join(project_dir, "parsed_result.parquet")]
else:
filepaths = list(
map(
Expand Down
7 changes: 5 additions & 2 deletions docs/source/data_creation/parse/parse.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,11 @@ If the parsing is completed successfully, the following three types of files are

#### Use all files

For example, if parsing is performed using three parse methods, the following files are created.
`0.parquet`, `1.parquet`, `2.parquet`, `parse_config.yaml`, `summary.csv`
You can use only one parse method at a time.

The parsed result will be saved as `parsed_result.parquet`.

If you want to use two `all_files` parse methods, you should run the parse pipeline twice with two different YAML files.

Finally, in the summary.csv file, you can see information about the parsed result, such as what parse method was used to parse it.

Expand Down
14 changes: 13 additions & 1 deletion sample_config/parse/all_files_full.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
# You can use only one of the following modules at a time.
modules:
# Use Directory Parse
- module_type: langchain_parse
file_type: all_files
parse_method: [ directory, unstructured, upstagedocumentparse ]
parse_method: directory
# Use Unstructured
- module_type: langchain_parse
file_type: all_files
parse_method: unstructured
# Use Upstage Document Parse
- module_type: langchain_parse
file_type: all_files
parse_method: upstagedocumentparse
# Use Naver Clova OCR
- module_type: clova
file_type: all_files
table_detection: true
# Use Llama Parse
- module_type: llamaparse
file_type: all_files
result_type: markdown
Expand Down
6 changes: 4 additions & 2 deletions tests/autorag/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,10 @@ def test_start_parsing_all_files(simple_parser):
project_dir = simple_parser.project_dir
assert os.path.exists(project_dir)
assert os.path.exists(os.path.join(project_dir, "parse_config.yaml"))
assert os.path.exists(os.path.join(project_dir, "0.parquet"))
all_files_result = pd.read_parquet(os.path.join(project_dir, "0.parquet"))
assert os.path.exists(os.path.join(project_dir, "parsed_result.parquet"))
all_files_result = pd.read_parquet(
os.path.join(project_dir, "parsed_result.parquet")
)

expect_result_columns = ["texts", "path", "page", "last_modified_datetime"]
assert all(
Expand Down

0 comments on commit 7867a19

Please sign in to comment.