Facing a problem when extracting a table using the "text" strategy #543
youpengbo2018
started this conversation in
Ask for help with specific PDFs
Replies: 2 comments
-
Hi @youpengbo2018 Appreciate your interest in the library. When you do
im = page.to_image(resolution=200)
im.draw_lines(page.curves+page.edges)
you'll notice that there are some hidden horizontal and vertical line separators. You can use the following code to pass them to the table extraction as explicit lines:
import pdfplumber
def get_vertical_lines(page):
    """
    Run table extraction using the default lines strategy and get the vertical lines
    from the first row.

    Args:
        page: An object exposing ``find_tables(table_settings=...)`` whose
            result is a sequence of tables with ``rows``; each row has
            ``cells``, indexable so that index 0 is the left edge and
            index 2 is the right edge.
            NOTE(review): presumably pdfplumber ``(x0, top, x1, bottom)``
            bounding boxes -- confirm against the installed version.

    Returns:
        A list with the left edge of every cell in the first row of the
        first table, followed by the right edge of that row's last cell.

    Raises:
        IndexError: If no table (or no row) is found on the page.
    """
    tables = page.find_tables(
        table_settings={"vertical_strategy": "lines", "horizontal_strategy": "lines"}
    )
    # Fail with a readable message instead of a bare "list index out of range".
    if not tables or not tables[0].rows:
        raise IndexError("no table with at least one row was found on the page")
    first_row = tables[0].rows[0]
    # Left edge of each cell, plus the right edge of the last cell to close the table.
    return [cell[0] for cell in first_row.cells] + [first_row.cells[-1][2]]
def get_horizontal_lines(page):
    """
    Get the coordinates of all the horizontal lines.

    Args:
        page: An object exposing ``height`` and ``horizontal_edges``, where
            each edge is a mapping with a ``"y0"`` entry.

    Returns:
        A list with ``page.height - edge["y0"]`` for every horizontal edge,
        in the order the edges are reported (empty if there are none).
        NOTE(review): this presumably converts a bottom-up PDF ``y0`` into a
        distance from the top of the page -- confirm for pages whose bounding
        box does not start at 0.
    """
    return [page.height - edge["y0"] for edge in page.horizontal_edges]
# Open the PDF in a context manager so the file handle is always released.
with pdfplumber.open("file.pdf") as pdf:
    page = pdf.pages[0]
    # Use the detected separators as explicit lines on both axes.
    table = page.extract_table(
        table_settings={
            "vertical_strategy": "explicit",
            "explicit_vertical_lines": get_vertical_lines(page),
            "horizontal_strategy": "explicit",
            "explicit_horizontal_lines": get_horizontal_lines(page),
        }
    )

# Guard with an empty list in case extract_table() yields None (no table found).
for row in table or []:
    print(row)
# Result is
This is very specific to the PDF you shared and may not be a plug-and-play solution, but I hope it has pointed you in the right direction. You can modify the above code to suit your needs better.
Beta Was this translation helpful? Give feedback.
0 replies
-
Thank you! It helps me a lot.
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
2021年1-4月份主要经济指标.pdf
Hi, this is the code I used to extract the table. After running the code, I found that I cannot get the full data of the final row. The row shows ['None','None','None','18.2','5459','27'], but I actually need the result to be ['公路运输','万吨','1701.0','18.2','5459','27']. Could you help me fix it?
import pdfplumber
import pandas as pd
from decimal import Decimal
# Set pandas display options (max columns, line width, max column width)
# that apply when DataFrames are printed below.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', 100)
def analyze_pdf(file_path, csv_path=r'F:\work\沈阳数据\text.csv'):
    """Extract every table from a PDF, print it, and write it to a CSV file.

    For each page, tables are extracted with the "lines" vertical strategy
    (plus explicit vertical lines placed at fixed fractions of the page
    width) and the "text" horizontal strategy (plus one explicit horizontal
    line near the top of the page). Each table is printed raw, loaded into a
    DataFrame, printed again with and without rows whose '指标' cell is
    empty, and saved to ``csv_path``.

    Args:
        file_path: Path of the PDF to read.
        csv_path: Destination of the CSV dump. Defaults to the path that was
            previously hard-coded.

    Returns:
        None. All output goes to stdout and to ``csv_path``.

    Raises:
        ValueError: Raised by pandas when a table does not have exactly
            seven columns.
    """
    # Column boundaries as fractions of the page width. They are kept as
    # strings so Decimal stores them exactly (Decimal(0.03) would carry the
    # binary floating-point error of the float literal).
    vertical_fractions = ("0.03", "0.36", "0.56", "0.691", "0.781", "0.91")
    column_names = ['指标', '1', '单位', "本月", '同比增长', '1-本月', '同比增长(%)']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # NOTE(review): Decimal coordinates presumably match a pdfplumber
            # release that reports page geometry as Decimal -- confirm against
            # the installed version.
            page_width = Decimal(page.width)
            page_height = Decimal(page.height)
            table_settings = {
                "vertical_strategy": "lines",
                "explicit_vertical_lines": [
                    page_width * Decimal(fraction) for fraction in vertical_fractions
                ],
                "explicit_horizontal_lines": [page_height * Decimal("0.001")],
                "horizontal_strategy": "text",
            }
            for table in page.extract_tables(table_settings=table_settings):
                print(page.width)
                print(page.bbox)
                print("table", table)
                df = pd.DataFrame(table, columns=column_names)
                del df['1']
                print(df)
                # Drop rows whose '指标' cell is missing: covers both real
                # None/NaN values and the literal string 'None'.
                df1 = df[df['指标'].notna() & (df['指标'] != 'None')]
                print(df1)
                # NOTE: every table is written to the same path, so the file
                # ends up holding only the last table that was extracted.
                df.to_csv(csv_path)
# Run the extraction only when the file is executed as a script.
if __name__ == '__main__':
    file_Path = r'F:\work\沈阳数据\2021年1-4月份主要经济指标.pdf'
    # analyze_pdf() prints its findings itself; the outer print shows its return value.
    print(analyze_pdf(file_Path))
# text_save(r'F:\work\沈阳数据\2021年1-4月份主要经济指标.txt', analyze_pdf(file_Path))
Beta Was this translation helpful? Give feedback.
All reactions