-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmpdf.py
101 lines (90 loc) · 3.2 KB
/
xmpdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""xmpdf.py: Xmpdf class definition."""
import pdftotext
import csv
import jsonpickle
from pgparse import parse, Email, Page
class Xmpdf:
    """
    A class that parses the emails in a PDF.

    ...

    Attributes
    ----------
    pdf : pdftotext.PDF
        page texts of the PDF; only set when the file could be read
    pgcnt : int
        number of pages in the PDF
    emails : list
        email objects, in the order they were found
    error : str
        message of the exception raised during processing, or None
    wcp : list
        word count per page

    Methods
    -------
    info()
        Returns high-level descriptive information about the PDF file
    email_metadata()
        Returns the info() of every email in the PDF
    to_json()
        Returns jsonified representation of Xmpdf object
    to_csv(csv_file)
        Writes a CSV representation of the Xmpdf emails to csv_file
    """

    def __init__(self, pdf_file):
        """
        Create Xmpdf object.

        Takes a PDF file and creates an Xmpdf instance which holds a parsed
        representation of the emails in the PDF in a list (``self.emails``).

        Any exception raised while reading or parsing the PDF is not
        propagated; its message is stored in ``self.error`` instead.
        """
        self.pgcnt = 0
        self.emails = []
        self.error = None
        self.wcp = []  # word count per page
        # Convert to text and parse. This is deliberately best-effort:
        # callers inspect self.error / info() rather than catching.
        try:
            self.pdf = pdftotext.PDF(pdf_file, physical=True)
            self.pgcnt = len(self.pdf)
            self.__parse()
        except Exception as e:
            self.error = str(e)

    def __parse(self):
        """
        Walk the pages in order and group them into emails.

        A page parsed as an Email starts a new email; a page parsed as a
        Page is appended to the email currently being built. Pages seen
        before the first Email are not attached to anything.
        """
        current_email = None
        # page_number is 1-based, matching what is stored on the email.
        for page_number in range(1, self.pgcnt + 1):
            text = self.pdf[page_number - 1]  # read the page only once
            self.wcp.append(len(text.split()))
            page = parse(text)
            if isinstance(page, Email):
                if current_email:
                    # The previous email is complete once a new one starts.
                    self.emails.append(current_email)
                current_email = page
                current_email.page_number = page_number
                current_email.page_count = 1
            elif isinstance(page, Page) and current_email:
                current_email.body += page.body
                current_email.page_count += 1
        if current_email:  # write last email
            self.emails.append(current_email)

    def info(self):
        """
        Return high-level descriptive information about the PDF file.

        The result has the form ``'<pages> pages, <emails> emails '``
        followed by ``', <note>'`` for each note. Notes are the stored
        processing error (if any) and a warning when no words were found.
        """
        notes = []
        if self.error:
            notes.append(self.error)
        # A PDF that failed to load also has an empty word count; in that
        # case the stored error is the real cause, so do not suggest OCR.
        if sum(self.wcp) == 0 and (self.pgcnt > 0 or not self.error):
            notes.append('WARNING: 0 words detected - PDF probably needs OCR')
        error_str = ''.join(', ' + note for note in notes)
        return f'{self.pgcnt} pages, {len(self.emails)} emails {error_str}'

    def email_metadata(self):
        """Return key metadata elements for each email in the PDF."""
        return [e.info() for e in self.emails]

    def to_json(self):
        """Return jsonified representation of Xmpdf object."""
        return jsonpickle.encode(self, unpicklable=False, indent=4)

    def to_csv(self, csv_file):
        """
        Write CSV representation of Xmpdf emails.

        ``csv_file`` is the writable file-like object handed to
        ``csv.writer``. The header row comes from the first email's
        ``csv_header``; one row per email follows. Nothing is written
        when there are no emails.
        """
        if self.emails:
            csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(self.emails[0].csv_header)
            for e in self.emails:
                csv_writer.writerow(e.flatten())