Skip to content

Commit

Permalink
Merge pull request #41 from Filimoa/llama-index-node-compatability
Browse files Browse the repository at this point in the history
Llama Index Integration
  • Loading branch information
Filimoa authored May 2, 2024
2 parents 6f090d4 + 7faea1d commit 7c22265
Show file tree
Hide file tree
Showing 13 changed files with 443 additions and 34 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ test-output.pdf
notebooks/
sample-docs/
weights/
.env

29 changes: 29 additions & 0 deletions docs/integrations.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Llama Index

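Open Parse integrates with Llama Index: you can convert a parsed document into Llama Index nodes and then build an index from those nodes. Note that `llama-index` is not a formal dependency of Open Parse, so install it separately first (e.g. `pip install llama-index`).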

```py
import openparse
from llama_index.core import VectorStoreIndex

doc_path = "./sample-docs/lyft-10k.pdf"
parser = openparse.DocumentParser()
parsed_doc = parser.parse(doc_path)

nodes = parsed_doc.to_llama_index_nodes()
index = VectorStoreIndex(nodes=nodes)
```

Now you can query the index:

```py
query_engine = index.as_query_engine()
response = query_engine.query("What do they do to make money?")
print(response)
```

You can also add the nodes to an existing index (here, `existing_index` is a Llama Index index you have already created):

```py
existing_index.insert_nodes(nodes)
```
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ nav:
- Advanced:
- Customization: processing/customization.md
- Serializing Results: serialization.md
- Integrations: integrations.md
- Visualization: visualization.md
- Config: config.md

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ description = "Streamlines the process of preparing documents for LLM's."
readme = "README.md"
requires-python = ">=3.8"
license = "MIT"
version = "0.5.5"
version = "0.5.6"
authors = [{name = "Sergey Filimonov", email = "[email protected]"}]
dependencies = [
"PyMuPDF >= 1.23.2",
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ beautifulsoup4
twine
packaging
wheel
llama-index
225 changes: 225 additions & 0 deletions src/cookbooks/llama_index.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2024-05-01 17:04:54-- https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf\n",
"Resolving sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)... 162.243.189.2\n",
"Connecting to sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 379188 (370K) [application/pdf]\n",
"Saving to: ‘sample-docs/lyft-10k.pdf’\n",
"\n",
"sample-docs/lyft-10 100%[===================>] 370.30K 1.99MB/s in 0.2s \n",
"\n",
"2024-05-01 17:04:57 (1.99 MB/s) - ‘sample-docs/lyft-10k.pdf’ saved [379188/379188]\n",
"\n"
]
}
],
"source": [
"import sys\n",
"\n",
"sys.path.append(\"..\")\n",
"\n",
"!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf -O sample-docs/lyft-10k.pdf"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# make sure llama-index is installed, it's not a formal dependency of open-parse\n",
"# %pip install llama-index"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished parsing\n"
]
}
],
"source": [
"import openparse\n",
"\n",
"doc_path = \"./sample-docs/lyft-10k.pdf\"\n",
"parser = openparse.DocumentParser()\n",
"parsed_doc = parser.parse(doc_path)\n",
"\n",
"print(\"Finished parsing\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node ID: 33747e2d-0478-4628-b112-d733b1fc5039\n",
"Text: Securities registered pursuant to Section 12(g) of the\n",
"Act:**None** Indicate by check mark if the Registrant is a well-known\n",
"seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒\n",
"No ☐ Indicate by check mark if the Registrant is not required to file\n",
"reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\n",
"Indicate by check ma...\n"
]
}
],
"source": [
"nodes = parsed_doc.to_llama_index_nodes()\n",
"\n",
"print(nodes[1])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id_': '33747e2d-0478-4628-b112-d733b1fc5039',\n",
" 'embedding': None,\n",
" 'metadata': {'bbox': [{'page': 0,\n",
" 'page_height': 792.0,\n",
" 'page_width': 612.0,\n",
" 'x0': 17.31,\n",
" 'y0': 332.53,\n",
" 'x1': 586.25,\n",
" 'y1': 424.21}]},\n",
" 'excluded_embed_metadata_keys': ['bbox'],\n",
" 'excluded_llm_metadata_keys': ['bbox'],\n",
" 'relationships': {<NodeRelationship.PREVIOUS: '2'>: {'node_id': '59644551-d995-4d0a-88f5-49bbf22f0617',\n",
" 'node_type': <ObjectType.TEXT: '1'>,\n",
" 'metadata': {'bbox': [{'page': 0,\n",
" 'page_height': 792.0,\n",
" 'page_width': 612.0,\n",
" 'x0': 17.31,\n",
" 'y0': 457.56,\n",
" 'x1': 590.92,\n",
" 'y1': 743.41}]},\n",
" 'hash': '77baa9ef95633b4c77c243ed3db29b4555c4f1f78c5b68f620eb8c4ff7f0a480',\n",
" 'class_name': 'RelatedNodeInfo'},\n",
" <NodeRelationship.NEXT: '3'>: {'node_id': '50744a8a-4ccb-4efa-a625-a1e7e3feec0c',\n",
" 'node_type': <ObjectType.TEXT: '1'>,\n",
" 'metadata': {'bbox': [{'page': 0,\n",
" 'page_height': 792.0,\n",
" 'page_width': 612.0,\n",
" 'x0': 17.31,\n",
" 'y0': 211.34,\n",
" 'x1': 586.62,\n",
" 'y1': 290.85}]},\n",
" 'hash': '965f5304799146fde0d2bb8fb5726c0646ddf46df3ada30a902adee9005c2333',\n",
" 'class_name': 'RelatedNodeInfo'},\n",
" <NodeRelationship.PARENT: '4'>: {'node_id': 'dc94d72c-ec16-41b7-9e01-f359284464d2',\n",
" 'node_type': <ObjectType.DOCUMENT: '4'>,\n",
" 'metadata': {'file_name': 'lyft-10k.pdf',\n",
" 'file_size': 379188,\n",
" 'creation_date': '2024-05-01',\n",
" 'last_modified_date': '2024-04-07'},\n",
" 'hash': '60b974c64ec56d53a58cfe7703901cd049f6f11c39af6612861193672cc07bd9',\n",
" 'class_name': 'RelatedNodeInfo'}},\n",
" 'text': 'Securities registered pursuant to Section 12(g) of the Act:**None**\\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\\nIndicate by check mark whether the Registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such\\nshorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during\\nthe preceding 12 months (or for such shorter period that the Registrant was required to submit such files). Yes ☒ No ☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, smaller reporting company, or an emerging growth company. See the definitions of\\n“large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.',\n",
" 'start_char_idx': None,\n",
" 'end_char_idx': None,\n",
" 'text_template': '{metadata_str}\\n\\n{content}',\n",
" 'metadata_template': '{key}: {value}',\n",
" 'metadata_seperator': '\\n',\n",
" 'class_name': 'TextNode'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nodes[1].dict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's add the nodes to a vector store"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"index = VectorStoreIndex(nodes=nodes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Now let's query our index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"They generate revenue primarily from service fees and commissions collected from drivers for their use of the ridesharing marketplace. Additionally, they earn revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by providing their ridesharing marketplace to organizations through Lyft Business offerings. In the second quarter of 2021, they also started generating revenues from licensing and data access agreements with third-party autonomous vehicle companies.\n"
]
}
],
"source": [
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What do they do to make money?\")\n",
"print(response)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "open-parse-notebooks",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 4 additions & 0 deletions src/openparse/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ def parse(
table_parsing_kwargs=(
table_args_obj.model_dump() if table_args_obj else None
),
creation_date=doc.file_metadata.get("creation_date"),
last_modified_date=doc.file_metadata.get("last_modified_date"),
last_accessed_date=doc.file_metadata.get("last_accessed_date"),
file_size=doc.file_metadata.get("file_size"),
)
return parsed_doc

Expand Down
Loading

0 comments on commit 7c22265

Please sign in to comment.