-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdiff-pdf-page-statistics.py
446 lines (392 loc) · 26.8 KB
/
diff-pdf-page-statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
#!/usr/bin/python3
#NOTE: you probably need to increase the cache allowed: /etc/ImageMagick-*/policy.xml
# <policy domain="resource" name="disk" value="16GiB"/>
# find docx/ -name "*.docx" -execdir basename {} \; | xargs -L1 -I{} python3 ../diff-pdf-page-statistics.py --no_save_overlay --base_file="{}"
# - delete any CSV files. Delete the import/export folders in converted
# - rename converted DOC/PPT/XLS pdfs from .docx_mso.pdf to .doc_mso.pdf, etc.
# WARNING: this is not a tool for administrative statistics; there are just too many false positives for counts of red dots to be meaningful.
# It is only meant to be useful for QA in identifying regressions.
# Given:
# - a document (in the "download/file_type/" folder)
# - an "authoritative" _mso.PDF of how the document should look (also in the "download/file_type/" folder)
# - a Collabora .PDF of the document (in the "converted/original_file_type" folder)
# - a Microsoft .original_file_type_mso.PDF of the Collabora-round-tripped file (in the "converted/original_file_type" folder)
# - history folder: a copy of the converted folder from a previous Collabora version (identifying the commit range to search for regressions)
#
# Running the tool:
# - cd into the history folder (or specify the folder with --history_dir=)
# - ../diff-pdf-page-statistics.py --base_file=document_name.ext
# - look at the import/export overlay PNG results in the converted folder
#
# False positives:
# - automatically updating fields: dates, =rand(), slide date/time ...
# - different font substitutions
import argparse
import os
import wand # pip install wand && OS_INSTALLER install imagemagick
from wand.image import Image
from wand.display import display
from wand.exceptions import PolicyError
from wand.exceptions import CacheError
import time
def printdebug(debug, *args, **kwargs):
    """
    Print a message only when debugging is switched on.

    Parameters:
    debug: When falsy, the call does nothing.
    *args: Positional arguments forwarded unchanged to print().
    **kwargs: Keyword arguments forwarded unchanged to print().
    """
    if not debug:
        return
    print(*args, **kwargs)
def main():
    """Compare rendered PDF pages of one document and log regression statistics.

    Parses the command line, loads the MS-original, Collabora, and
    MS-round-tripped PDFs for ``--base_file`` (plus the optional historical
    copies found under ``--history_dir``), builds per-page overlay images,
    optionally saves them as PNG files, and appends per-page pixel statistics
    and page-count anomalies to CSV reports in the current directory.

    Exits with status 0 when the file is on the skip list, and with status 1
    when a required PDF is missing, a historical PDF still needs renaming, or
    ImageMagick raises a policy or cache error.
    """
    # Function-scope import: sys.exit() replaces the site-provided exit() helper,
    # which is intended for the interactive interpreter only.
    import sys

    parser = argparse.ArgumentParser(description="Look for import and export regressions.")
    parser.add_argument("--base_file", default="lorem ipsum.docx")
    parser.add_argument("--history_dir", default=".")
    parser.add_argument("--max_page", type=int, default=10)  # limit PDF comparison to the first ten pages
    parser.add_argument("--no_save_overlay", action="store_true")  # default is false
    parser.add_argument("--resolution", type=int, default=150)
    parser.add_argument("--debug", action="store_true")  # default is false
    args = parser.parse_args()

    DEBUG = args.debug
    MAX_PAGES = args.max_page
    RESOLUTION = args.resolution

    # Exclude notorious false positives that have no redeeming value in being constantly brought to the attention of QA
    SKIPPED_FILES = {
        'forum-mso-de-108371.xlsx',  # =rand()
        'forum-mso-de-70016.docx',  # =rand()
        'forum-mso-en-1268.docx',  # =rand()
        'forum-fr-9115.doc',  # date/time/temp-filename field
        'forum-fr-17720.doc',  # date/time/temp-filename field
        'forum-mso-de-79405.docx',  # date/time/temp-filename field
        'forum-mso-de-90801.docx',  # date/time/temp-filename field
        'forum-mso-de-92011.docx',  # date/time/temp-filename field
        'forum-mso-de-92780.docx',  # date/time/temp-filename field
        'forum-mso-en-10944.docx',  # date/time/temp-filename field
        # 'forum-mso-en4-282494.docx' (date/time/temp-filename field) is deliberately NOT skipped
        'tdf130041-1.docx',  # date/time/temp-filename field
        'forum-fr-16236.docx',  # effective duplicate
        'forum-fr-16238.docx',  # effective duplicate
        'forum-mso-de-54647.docx',  # effective duplicate
        'forum-mso-de-126251.docx',  # effective duplicate
        'forum-mso-de-139701.docx',  # effective duplicate
        'forum-mso-en-3785.docx',  # effective duplicate
        'forum-mso-en-3786.docx',  # effective duplicate
        'forum-mso-en-5125.docx',  # effective duplicate
        'forum-mso-en-5126.docx',  # effective duplicate
    }
    if args.base_file in SKIPPED_FILES:
        print("SKIPPING FILE", args.base_file, ": determined to be unusable for testing...")
        sys.exit(0)

    print ("Processing: ", args.base_file)

    # When run from inside a history folder, the download/converted folders live one level up.
    base_dir = "./"
    if args.history_dir == "." and not os.path.isdir('download') and os.path.isdir(os.path.join("..", 'download')):
        base_dir = "../"

    file_ext = os.path.splitext(args.base_file)
    file_type = file_ext[1][1:]  # extension without the leading dot; doubles as the sub-folder name

    MS_ORIG = os.path.join(base_dir, "download", file_type, args.base_file + "_mso.pdf")
    if not os.path.isfile(MS_ORIG):
        print ("original PDF file [" + MS_ORIG +"] not found")
        sys.exit(1)
    LO_ORIG = os.path.join(base_dir, "converted", file_type, file_ext[0] + ".pdf")
    if not os.path.isfile(LO_ORIG):
        print ("Collabora PDF file [" + LO_ORIG +"] not found")
        sys.exit(1)
    MS_CONV = os.path.join(base_dir, "converted", file_type, args.base_file + "_mso.pdf")
    if not os.path.isfile(MS_CONV):
        print ("MS converted PDF file [" + MS_CONV +"] not found")
        sys.exit(1)

    # This tool is not very useful without having a previous run to compare against.
    # However, it still can create overlay images, which might be of some value.
    LO_PREV = os.path.join(args.history_dir, file_type, file_ext[0] + ".pdf")
    IS_FILE_LO_PREV = os.path.isfile(LO_PREV)
    MS_PREV = os.path.join(args.history_dir, file_type, args.base_file + "_mso.pdf")
    IS_FILE_MS_PREV = os.path.isfile(MS_PREV)

    # MSO's PDF of the exported file needs to be manually renamed to match the base_file name.
    # Helpfully exit if we detect that has not occurred yet.
    if file_ext[1] in (".doc", ".xls", ".ppt") and not IS_FILE_MS_PREV:
        modern_ext = file_ext[1] + "x"  # .doc -> .docx, .xls -> .xlsx, .ppt -> .pptx
        if os.path.isfile(os.path.join(args.history_dir, file_type, file_ext[0] + modern_ext + "_mso.pdf")):
            print (f"Previous MS converted PDF not renamed. rename s/{modern_ext}_mso.pdf/{file_ext[1]}_mso.pdf/ {file_type}/*{modern_ext}_mso.pdf")
            sys.exit(1)

    # exist_ok avoids a check-then-create race when several instances run in parallel.
    IMPORT_DIR = os.path.join(base_dir, "converted", "import", file_type)
    os.makedirs(IMPORT_DIR, exist_ok=True)
    EXPORT_DIR = os.path.join(base_dir, "converted", "export", file_type)
    os.makedirs(EXPORT_DIR, exist_ok=True)
    IMPORT_COMPARE_DIR = os.path.join(base_dir, "converted", "import-compare", file_type)
    os.makedirs(IMPORT_COMPARE_DIR, exist_ok=True)
    EXPORT_COMPARE_DIR = os.path.join(base_dir, "converted", "export-compare", file_type)
    os.makedirs(EXPORT_COMPARE_DIR, exist_ok=True)

    try:
        # The "correct" PDF: created by MS Word of the original file
        MS_ORIG_PDF = Image(filename=MS_ORIG, resolution=RESOLUTION)

        # A PDF of how it is displayed in Writer - to be compared to MS_ORIG
        LO_ORIG_PDF = Image(filename=LO_ORIG, resolution=RESOLUTION)

        # A PDF of how MS Word displays Writer's round-tripped file - to be compared to MS_ORIG
        MS_CONV_PDF = Image(filename=MS_CONV, resolution=RESOLUTION)

        # A historical version of how it was displayed in Writer
        LO_PREV_PDF = Image()
        LO_PREV_PAGES = MAX_PAGES  # neutral value for min() below when there is no history
        if IS_FILE_LO_PREV:
            LO_PREV_PDF = Image(filename=LO_PREV, resolution=RESOLUTION)
            LO_PREV_PAGES = len(LO_PREV_PDF.sequence)

        # A historical version of how the round-tripped file was displayed in Word
        MS_PREV_PDF = Image()
        MS_PREV_PAGES = MAX_PAGES  # neutral value for min() below when there is no history
        if IS_FILE_MS_PREV:
            MS_PREV_PDF = Image(filename=MS_PREV, resolution=RESOLUTION)
            MS_PREV_PAGES = len(MS_PREV_PDF.sequence)
    except PolicyError:
        print("Warning: Operation not allowed due to security policy restrictions for PDF files.")
        print("Please modify the '/etc/ImageMagick-6/policy.xml' file to allow PDF processing.")
        print("<policy domain=\"coder\" rights=\"read\" pattern=\"PDF\" />")
        sys.exit(1)
    except CacheError as e:
        print("Exception message: ", str(e))
        print("You probably need to increase the cache allowed in /etc/ImageMagick-6/policy.xml")
        print("<policy domain=\"resource\" name=\"disk\" value=\"16GiB\"/>")
        sys.exit(1)

    # Only compare pages that exist in every document involved.
    pages = min(MAX_PAGES, len(MS_ORIG_PDF.sequence), len(LO_ORIG_PDF.sequence), len(MS_CONV_PDF.sequence), LO_PREV_PAGES, MS_PREV_PAGES)
    printdebug(DEBUG, "DEBUG ", args.base_file, " pages[", pages, "] ", MAX_PAGES, len(MS_ORIG_PDF.sequence), len(LO_ORIG_PDF.sequence), len(MS_CONV_PDF.sequence), len(LO_PREV_PDF.sequence), len(MS_PREV_PDF.sequence))

    MS_ORIG_SIZE = []  # total number of pixels on the page
    MS_ORIG_CONTENT = []  # the number of non-background pixels
    LO_ORIG_SIZE = []
    LO_ORIG_CONTENT = []
    MS_CONV_SIZE = []
    MS_CONV_CONTENT = []
    LO_PREV_SIZE = []
    LO_PREV_CONTENT = []
    MS_PREV_SIZE = []
    MS_PREV_CONTENT = []

    IMPORT_RED = []  # the number of red pixels on the page
    EXPORT_RED = []
    PREV_IMPORT_RED = []
    PREV_EXPORT_RED = []

    # Paint the authoritative content red; the other renderings get overlaid on top of it,
    # so whatever red remains visible is content the other rendering failed to cover.
    MS_ORIG_RED = MS_ORIG_PDF.clone()
    for pgnum in range(0, pages):
        with MS_ORIG_RED.sequence[pgnum] as page:  # need this 'with' clause so that MS_ORIG_RED is actually updated with the following changes
            # NOTE: MS_ORIG_SIZE is recorded once per page in the statistics loop below,
            # not here - appending in both places doubled the list.
            page.transform_colorspace('gray')
            page.alpha_channel = 'remove'  # so that 'red' will be painted as 'red' and not some transparent-ized shade of red
            page.opaque_paint('black', 'red', fuzz=MS_ORIG_PDF.quantum_range * 0.90)

    # Composed image: overlay red MS_ORIG with LO_ORIG
    IMPORT_IMAGE = MS_ORIG_RED.clone()
    # Composed image: overlay red MS_ORIG with MS_CONV
    EXPORT_IMAGE = MS_ORIG_RED.clone()
    # Composed image: overlay red MS_ORIG with LO_PREV
    PREV_IMPORT_IMAGE = MS_ORIG_RED.clone()
    # Composed image: overlay red MS_ORIG with MS_PREV
    PREV_EXPORT_IMAGE = MS_ORIG_RED.clone()

    # Composed image: overlay red LO_ORIG and blue LO_PREV with gray MS_ORIG_PDF
    # This is the visual key to the whole tool. The red/blue underlay should be identical except for import fixes or regressions
    IMPORT_COMPARE_IMAGE = LO_ORIG_PDF.clone()
    # Composed image: overlay red MS_CONV and blue MS_PREV with gray MS_ORIG_PDF
    # This is the visual key to the whole tool. The red/blue underlay should be identical except for export fixes or regressions
    EXPORT_COMPARE_IMAGE = MS_CONV_PDF.clone()

    for pgnum in range(0, pages):
        # ---- per-page statistics (gathered on throw-away clones) ----
        tmp = MS_ORIG_PDF.clone()  # don't make changes to these PDF pages - just get statistics...
        with tmp.sequence[pgnum] as page:
            page.quantize(2)
            MS_ORIG_SIZE.append(page.height * page.width)
            MS_ORIG_CONTENT.append(min(list(page.histogram.values())))  # assuming that the background is more than 50%
            printdebug(DEBUG, "DEBUG MS_ORIG[", pgnum, "] size[", MS_ORIG_SIZE[pgnum], "] content[", MS_ORIG_CONTENT[pgnum], "] percent[", (MS_ORIG_CONTENT[pgnum] / MS_ORIG_SIZE[pgnum]), "] colorspace[", page.colorspace, "] background[", page.background_color, "] ", list(page.histogram.values()), list(page.histogram.keys()))

        tmp = LO_ORIG_PDF.clone()
        with tmp.sequence[pgnum] as page:
            page.quantize(2)
            LO_ORIG_SIZE.append(page.height * page.width)
            LO_ORIG_CONTENT.append(min(list(page.histogram.values())))  # assuming that the background is more than 50%
            printdebug(DEBUG, "DEBUG LO_ORIG[", pgnum, "] size[", LO_ORIG_SIZE[pgnum], "] content[", LO_ORIG_CONTENT[pgnum], "] percent[", (LO_ORIG_CONTENT[pgnum] / LO_ORIG_SIZE[pgnum]), "] colorspace[", page.colorspace, "] background[", page.background_color, "] ", list(page.histogram.values()), list(page.histogram.keys()))

        tmp = MS_CONV_PDF.clone()
        with tmp.sequence[pgnum] as page:
            page.quantize(2)
            MS_CONV_SIZE.append(page.height * page.width)
            MS_CONV_CONTENT.append(min(list(page.histogram.values())))
            printdebug(DEBUG, "DEBUG MS_CONV[", pgnum, "] size[", MS_CONV_SIZE[pgnum], "] content[", MS_CONV_CONTENT[pgnum], "] ", list(page.histogram.values()), list(page.histogram.keys()), " percent[", MS_CONV_CONTENT[pgnum] / MS_CONV_SIZE[pgnum], "]")

        if IS_FILE_LO_PREV:
            tmp = LO_PREV_PDF.clone()
            with tmp.sequence[pgnum] as page:
                page.quantize(2)
                LO_PREV_SIZE.append(page.height * page.width)
                LO_PREV_CONTENT.append(min(list(page.histogram.values())))
                printdebug(DEBUG, "DEBUG LO_PREV[", pgnum, "] size[", LO_PREV_SIZE[pgnum], "] content[", LO_PREV_CONTENT[pgnum], "] ", list(page.histogram.values()), list(page.histogram.keys()), " percent[", LO_PREV_CONTENT[pgnum] / LO_PREV_SIZE[pgnum], "]")

        if IS_FILE_MS_PREV:
            tmp = MS_PREV_PDF.clone()
            with tmp.sequence[pgnum] as page:
                page.quantize(2)
                MS_PREV_SIZE.append(page.height * page.width)
                MS_PREV_CONTENT.append(min(list(page.histogram.values())))
                printdebug(DEBUG, "DEBUG MS_PREV[", pgnum, "] size[", MS_PREV_SIZE[pgnum], "] content[", MS_PREV_CONTENT[pgnum], "] ", list(page.histogram.values()), list(page.histogram.keys()), " percent[", MS_PREV_CONTENT[pgnum] / MS_PREV_SIZE[pgnum], "]")

        # ---- import overlay: current Collabora rendering on top of red MS_ORIG ----
        with IMPORT_IMAGE.sequence[pgnum] as page:
            LO_ORIG_PDF.sequence[pgnum].transform_colorspace('gray')
            LO_ORIG_PDF.sequence[pgnum].transparent_color(LO_ORIG_PDF.background_color, 0, fuzz=LO_ORIG_PDF.quantum_range * 0.10)
            #display(Image(page)) #debug
            #display(Image(LO_ORIG_PDF.sequence[pgnum])) #debug
            page.composite(LO_ORIG_PDF.sequence[pgnum])  # overlay (red) MS_ORIG with LO_ORIG
            page.merge_layers('flatten')
            #display(Image(page)) #debug
        IMPORT_RED.append(0)
        try:
            IMPORT_RED[pgnum] = IMPORT_IMAGE.sequence[pgnum].histogram[wand.color.Color('red')]
        except Exception:  # best effort: a page without pure red keeps a count of 0
            printdebug(DEBUG, "IMPORT EXCEPTION: could not get red color from page ", pgnum)#, list(IMPORT_IMAGE.sequence[pgnum].histogram.keys()))

        # ---- export overlay: MS rendering of the round-tripped file on top of red MS_ORIG ----
        with EXPORT_IMAGE.sequence[pgnum] as page:
            MS_CONV_PDF.sequence[pgnum].transform_colorspace('gray')
            MS_CONV_PDF.sequence[pgnum].transparent_color(MS_CONV_PDF.background_color, 0, fuzz=MS_CONV_PDF.quantum_range * 0.10)
            page.composite(MS_CONV_PDF.sequence[pgnum])  # overlay (red) MS_ORIG with MS_CONV
            page.merge_layers('flatten')
        EXPORT_RED.append(0)
        try:
            EXPORT_RED[pgnum] = EXPORT_IMAGE.sequence[pgnum].histogram[wand.color.Color('red')]
        except Exception:  # best effort: a page without pure red keeps a count of 0
            printdebug(DEBUG, "EXPORT EXCEPTION: could not get red color from page ", pgnum)# , list(EXPORT_IMAGE.sequence[pgnum].histogram.keys()))

        # ---- historical import overlay and red/blue import comparison ----
        PREV_IMPORT_RED.append(0)
        if IS_FILE_LO_PREV:
            with PREV_IMPORT_IMAGE.sequence[pgnum] as page:
                LO_PREV_PDF.sequence[pgnum].transform_colorspace('gray')
                LO_PREV_PDF.sequence[pgnum].transparent_color(LO_PREV_PDF.background_color, 0, fuzz=LO_PREV_PDF.quantum_range * 0.10)
                page.composite(LO_PREV_PDF.sequence[pgnum])  # overlay (red) MS_ORIG with LO_PREV
                page.merge_layers('flatten')
            try:
                PREV_IMPORT_RED[pgnum] = PREV_IMPORT_IMAGE.sequence[pgnum].histogram[wand.color.Color('red')]
            except Exception:  # best effort: a page without pure red keeps a count of 0
                printdebug(DEBUG, "PREV_IMPORT EXCEPTION: could not get red color from page ", pgnum)#, list(PREV_IMPORT_IMAGE.sequence[pgnum].histogram.keys()))

            with IMPORT_COMPARE_IMAGE.sequence[pgnum] as page:
                page.transform_colorspace('gray')
                page.opaque_paint('black', 'red', fuzz=LO_ORIG_PDF.quantum_range * 0.90)
                LO_PREV_PDF.sequence[pgnum].transparent_color(LO_PREV_PDF.background_color, 0, fuzz=LO_PREV_PDF.quantum_range * 0.10)
                LO_PREV_PDF.sequence[pgnum].opaque_paint('black', 'blue', fuzz=LO_PREV_PDF.quantum_range * 0.90)
                page.composite(LO_PREV_PDF.sequence[pgnum])  # overlay (red) LO_ORIG with (blue) LO_PREV
                MS_ORIG_PDF.sequence[pgnum].transparent_color(MS_ORIG_PDF.background_color, 0, fuzz=MS_ORIG_PDF.quantum_range * 0.10)
                MS_ORIG_PDF.sequence[pgnum].transform_colorspace('gray')
                page.composite(MS_ORIG_PDF.sequence[pgnum])  # overlay both with the authoritative contents in gray
                page.merge_layers('flatten')

        # ---- historical export overlay and red/blue export comparison ----
        PREV_EXPORT_RED.append(0)
        if IS_FILE_MS_PREV:
            with PREV_EXPORT_IMAGE.sequence[pgnum] as page:
                MS_PREV_PDF.sequence[pgnum].transform_colorspace('gray')
                MS_PREV_PDF.sequence[pgnum].transparent_color(MS_PREV_PDF.background_color, 0, fuzz=MS_PREV_PDF.quantum_range * 0.10)
                page.composite(MS_PREV_PDF.sequence[pgnum])  # overlay (red) MS_ORIG with MS_PREV
                page.merge_layers('flatten')
            try:
                PREV_EXPORT_RED[pgnum] = PREV_EXPORT_IMAGE.sequence[pgnum].histogram[wand.color.Color('red')]
            except Exception:  # best effort: a page without pure red keeps a count of 0
                printdebug(DEBUG, "PREV_EXPORT EXCEPTION: could not get red color from page ", pgnum)#, list(PREV_EXPORT_IMAGE.sequence[pgnum].histogram.keys()))

            with EXPORT_COMPARE_IMAGE.sequence[pgnum] as page:
                page.transform_colorspace('gray')
                page.opaque_paint('black', 'red', fuzz=MS_CONV_PDF.quantum_range * 0.90)
                MS_PREV_PDF.sequence[pgnum].transparent_color(MS_PREV_PDF.background_color, 0, fuzz=MS_PREV_PDF.quantum_range * 0.10)
                MS_PREV_PDF.sequence[pgnum].opaque_paint('black', 'blue', fuzz=MS_PREV_PDF.quantum_range * 0.90)
                page.composite(MS_PREV_PDF.sequence[pgnum])  # overlay (red) MS_CONV with (blue) MS_PREV
                MS_ORIG_PDF.sequence[pgnum].transparent_color(MS_ORIG_PDF.background_color, 0, fuzz=MS_ORIG_PDF.quantum_range * 0.10)
                MS_ORIG_PDF.sequence[pgnum].transform_colorspace('gray')
                page.composite(MS_ORIG_PDF.sequence[pgnum])  # overlay both with the authoritative contents in gray
                page.merge_layers('flatten')

    for pageToSave in range(0, pages):
        FORCE_SAVE_IMPORT = False
        FORCE_SAVE_EXPORT = False
        if args.no_save_overlay:
            # Always provide pages that have more RED now than in the previous version - QA needs to check them out (at least the first one).
            if IS_FILE_LO_PREV and IMPORT_RED[pageToSave] > PREV_IMPORT_RED[pageToSave]:
                FORCE_SAVE_IMPORT = True
            if IS_FILE_MS_PREV and EXPORT_RED[pageToSave] > PREV_EXPORT_RED[pageToSave]:
                FORCE_SAVE_EXPORT = True

        if not args.no_save_overlay or FORCE_SAVE_IMPORT:
            printdebug(DEBUG, f"DEBUG saving {args.base_file} page {pageToSave+1} IMPORT[{IMPORT_RED[pageToSave]} PREV[{PREV_IMPORT_RED[pageToSave]}]")
            with Image(IMPORT_IMAGE.sequence[pageToSave]) as img_to_save:
                file_name = os.path.join(IMPORT_DIR, args.base_file + f"_import-{pageToSave}.png")
                img_to_save.save(filename=file_name)

            if IS_FILE_LO_PREV:
                with Image(PREV_IMPORT_IMAGE.sequence[pageToSave]) as img_to_save:
                    file_name = os.path.join(IMPORT_DIR, args.base_file + f"_prev-import-{pageToSave}.png")
                    img_to_save.save(filename=file_name)
                with Image(IMPORT_COMPARE_IMAGE.sequence[pageToSave]) as img_to_save:
                    file_name = os.path.join(IMPORT_COMPARE_DIR, args.base_file + f"_import-compare-{pageToSave}.png")
                    img_to_save.save(filename=file_name)

        if not args.no_save_overlay or FORCE_SAVE_EXPORT:
            printdebug(DEBUG, f"DEBUG saving {args.base_file} page {pageToSave+1} EXPORT[{EXPORT_RED[pageToSave]} PREV[{PREV_EXPORT_RED[pageToSave]}]")
            with Image(EXPORT_IMAGE.sequence[pageToSave]) as img_to_save:
                file_name = os.path.join(EXPORT_DIR, args.base_file + f"_export-{pageToSave}.png")
                img_to_save.save(filename=file_name)

            if IS_FILE_MS_PREV:
                with Image(PREV_EXPORT_IMAGE.sequence[pageToSave]) as img_to_save:
                    file_name = os.path.join(EXPORT_DIR, args.base_file + f"_prev-export-{pageToSave}.png")
                    img_to_save.save(filename=file_name)
                with Image(EXPORT_COMPARE_IMAGE.sequence[pageToSave]) as img_to_save:
                    file_name = os.path.join(EXPORT_COMPARE_DIR, args.base_file + f"_export-compare-{pageToSave}.png")
                    img_to_save.save(filename=file_name)

    # allow the script to run in parallel - wait for lock on report to be released.
    # if lock file exists, wait for one second and try again
    # else
    #    create lock file and put the file name in it
    #    wait for a bit and then read to verify the lock is mine
    LOCK_FILE = "diff-pdf-" + file_type + "-statistics.lock"
    while True:
        if os.path.isfile(LOCK_FILE):
            printdebug(DEBUG, "DEBUG: waiting for file to unlock")
        else:
            with open(LOCK_FILE, 'w') as f:
                f.write(args.base_file)
            time.sleep(0.1)  # one tenth of a second
            with open(LOCK_FILE, 'r') as f:
                LOCK = f.read()
            printdebug(DEBUG, "DEBUG LOCK[", LOCK, "]")
            if LOCK == args.base_file:
                # Release the lock even when report writing fails; a stale lock file
                # would make every other parallel instance wait forever.
                try:
                    with open('diff-pdf-' + file_type + '-import-statistics.csv', 'a') as f:
                        for pgnum in range(0, pages):
                            OUT_STRING = [ args.base_file ]
                            OUT_STRING.append(str(pgnum + 1))  # use a human-oriented 1-based number for reporting...
                            OUT_STRING.append(str(MS_ORIG_SIZE[pgnum]))
                            OUT_STRING.append(str(MS_ORIG_CONTENT[pgnum]))
                            OUT_STRING.append(str(MS_ORIG_CONTENT[pgnum] / MS_ORIG_SIZE[pgnum]))
                            OUT_STRING.append(str(LO_ORIG_SIZE[pgnum]))
                            OUT_STRING.append(str(LO_ORIG_CONTENT[pgnum]))
                            OUT_STRING.append(str(LO_ORIG_CONTENT[pgnum] / LO_ORIG_SIZE[pgnum]))
                            OUT_STRING.append(str(IMPORT_RED[pgnum]))
                            OUT_STRING.append(str(IMPORT_RED[pgnum] / LO_ORIG_CONTENT[pgnum]))
                            if IS_FILE_LO_PREV:
                                OUT_STRING.append(str(LO_PREV_SIZE[pgnum]))
                                OUT_STRING.append(str(LO_PREV_CONTENT[pgnum]))
                                OUT_STRING.append(str(LO_PREV_CONTENT[pgnum] / LO_PREV_SIZE[pgnum]))
                                OUT_STRING.append(str(PREV_IMPORT_RED[pgnum]))
                                OUT_STRING.append(str(PREV_IMPORT_RED[pgnum] / LO_PREV_CONTENT[pgnum]))
                            f.write(','.join(OUT_STRING) + '\n')

                    with open('diff-pdf-' + file_type + '-export-statistics.csv', 'a') as f:
                        for pgnum in range(0, pages):
                            OUT_STRING = [ args.base_file ]
                            OUT_STRING.append(str(pgnum + 1))  # use a human-oriented 1-based number for reporting...
                            OUT_STRING.append(str(MS_ORIG_SIZE[pgnum]))
                            OUT_STRING.append(str(MS_ORIG_CONTENT[pgnum]))
                            OUT_STRING.append(str(MS_ORIG_CONTENT[pgnum] / MS_ORIG_SIZE[pgnum]))
                            OUT_STRING.append(str(MS_CONV_SIZE[pgnum]))
                            OUT_STRING.append(str(MS_CONV_CONTENT[pgnum]))
                            OUT_STRING.append(str(MS_CONV_CONTENT[pgnum] / MS_CONV_SIZE[pgnum]))
                            OUT_STRING.append(str(EXPORT_RED[pgnum]))
                            OUT_STRING.append(str(EXPORT_RED[pgnum] / MS_CONV_CONTENT[pgnum]))
                            if IS_FILE_MS_PREV:
                                OUT_STRING.append(str(MS_PREV_SIZE[pgnum]))
                                OUT_STRING.append(str(MS_PREV_CONTENT[pgnum]))
                                OUT_STRING.append(str(MS_PREV_CONTENT[pgnum] / MS_PREV_SIZE[pgnum]))
                                OUT_STRING.append(str(PREV_EXPORT_RED[pgnum]))
                                OUT_STRING.append(str(PREV_EXPORT_RED[pgnum] / MS_PREV_CONTENT[pgnum]))
                            f.write(','.join(OUT_STRING) + '\n')

                    with open('diff-pdf-' + file_type + '-statistics-anomalies.csv', 'a') as f:
                        if IS_FILE_LO_PREV and len(LO_ORIG_PDF.sequence) != LO_PREV_PAGES:
                            f.write(args.base_file + f",import,page count different from {args.history_dir} [{LO_PREV_PAGES}] and converted [{len(LO_ORIG_PDF.sequence)}]. Should be[{len(MS_ORIG_PDF.sequence)}]" + '\n')
                        if IS_FILE_MS_PREV and len(MS_CONV_PDF.sequence) != MS_PREV_PAGES:
                            f.write(args.base_file + f",export,page count different from {args.history_dir} [{MS_PREV_PAGES}] and converted [{len(MS_CONV_PDF.sequence)}]. Should be[{len(MS_ORIG_PDF.sequence)}]" + '\n')
                        # Although absolute wrongs normally shouldn't be reported (only report a change from previous version) - wrong page count was requested to be an exception.
                        if len(LO_ORIG_PDF.sequence) != len(MS_ORIG_PDF.sequence):
                            f.write(args.base_file + f",import, absolute page count, {len(LO_ORIG_PDF.sequence)}, should be, {len(MS_ORIG_PDF.sequence)}" + '\n')
                        if len(MS_CONV_PDF.sequence) != len(MS_ORIG_PDF.sequence):
                            f.write(args.base_file + f",export, absolute page count, {len(MS_CONV_PDF.sequence)}, should be, {len(MS_ORIG_PDF.sequence)}" + '\n')
                finally:
                    os.remove(LOCK_FILE)
                return
            else:
                printdebug(DEBUG, "DEBUG: not my lock after all - try again")
        time.sleep(1)  # second
# Run the comparison only when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()