Skip to content

Commit

Permalink
change file type logic (#1044)
Browse files (browse the repository at this point in the history)
* change file type logic

* change delete logic
  • Loading branch information
bwook00 authored Dec 9, 2024
1 parent 5fc4a96 commit e5e2b8b
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
2 changes: 1 addition & 1 deletion autorag/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.11rc2
0.3.11rc3
19 changes: 15 additions & 4 deletions autorag/data/parse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,23 @@ def run_parser(
)
set_file_types = set([module["file_type"] for module in module_params])

# Calculate the set difference once
file_types_to_remove = set_file_types - file_types

# Use list comprehension to filter out unwanted elements
module_params = [
param
for param in module_params
if param["file_type"] not in file_types_to_remove
]
modules = [
module
for module, param in zip(modules, module_params)
if param["file_type"] not in file_types_to_remove
]

# create a list of only those file_types that are in file_types but not in set_file_types
missing_file_types = list(file_types - set_file_types)
if list(set_file_types - file_types):
raise ValueError(
f"File types {list(set_file_types - file_types)} are not in the data path."
)

if missing_file_types:
add_modules_list = []
Expand Down
22 changes: 22 additions & 0 deletions tests/autorag/data/parse/test_parse_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,25 @@ def test_run_parser():
"file_type": "pdf",
}
assert os.path.exists(os.path.join(temp_dir, "pdf.parquet"))


def test_run_parser_two():
    """Run ``run_parser`` with a pdf module and a csv module, ``all_files=False``.

    Both modules are ``langchain_parse``; one is configured for ``pdf`` and
    the other for ``csv``. The assertions expect a ``summary.csv`` and a
    ``pdf.parquet`` in the output directory, and exactly one summary row
    whose parameters are those of the pdf module.

    NOTE(review): presumably ``eng_text_glob`` matches only pdf files, which
    would be why the csv module is expected to be dropped -- confirm against
    the fixture definition.
    """
    parse_modules = [langchain_parse, langchain_parse]
    parse_params = [
        {"parse_method": "pdfminer", "file_type": "pdf"},
        {"parse_method": "csv", "file_type": "csv"},
    ]
    glob_pattern = eng_text_glob

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as out_dir:
        summary = run_parser(
            parse_modules, parse_params, glob_pattern, out_dir, all_files=False
        )

        # The run must leave a summary file in the output directory.
        summary_csv = os.path.join(out_dir, "summary.csv")
        assert os.path.exists(summary_csv)

        # Column names are compared as a set, so their order is irrelevant.
        assert set(summary.columns) == {
            "filename",
            "module_name",
            "module_params",
            "execution_time",
        }

        # Only one of the two configured modules should be in the summary.
        assert len(summary) == 1
        first_params = summary["module_params"][0]
        assert first_params == {
            "parse_method": "pdfminer",
            "file_type": "pdf",
        }

        pdf_parquet = os.path.join(out_dir, "pdf.parquet")
        assert os.path.exists(pdf_parquet)

0 comments on commit e5e2b8b

Please sign in to comment.