-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathamazon.py
608 lines (509 loc) · 24.3 KB
/
amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
"""Retrieves order invoices from Amazon.
This uses the `selenium` Python package in conjunction with `chromedriver` to
scrape the Amazon website.
Configuration:
==============
The following keys may be specified as part of the configuration dict:
- `credentials`: Required. Must be a `dict` with `'username'` and `'password'`
keys.
- `output_directory`: Required. Must be a `str` that specifies the path on the
local filesystem where the output will be written. If the directory does not
exist, it will be created.
- `dir_per_year`: Optional. If true (default is false), adds one subdirectory
to the output for each year's worth of transactions. Useful for filesystems
that struggle with very large directories. Probably not that useful for
actually finding anything, given the uselessness of Amazon's order ID
scheme.
- `amazon_domain`: Optional. Specifies the Amazon domain from which to download
orders. Must be one of `'.com'`, `'.co.uk'` or `'.de'`. Defaults to
`'.com'`.
- `regular`: Optional. Must be a `bool`. If `True` (the default), download regular orders.
For domains other than `amazon_domain=".com"`, `True` downloads regular AND digital orders.
- `digital`: Optional. Must be a `bool` or `None`. If `True`, download digital
orders. Effective only for `amazon_domain=".com"`. Defaults to `True` for
`amazon_domain=".com"`. For other domains, digital invoices are downloaded
together with regular invoices since there is no separate menu on the Amazon website.
- `profile_dir`: Optional. If specified, must be a `str` that specifies the
path to a persistent Chrome browser profile to use. This should be a path
used solely for this single configuration; it should not refer to your normal
browser profile. If not specified, a fresh temporary profile will be used
each time.
- `order_groups`: Optional. If specified, must be a list of strings specifying the Amazon
order page "order groups" that will be scanned for orders to download. Order groups
include years (e.g. '2020'), as well as 'last 30 days' and 'past 3 months'.
- `download_preorder_invoices`: Optional. Must be a `bool`. If `False` (the
default), invoices for preorders (i.e. orders that have not actually been
charged yet) are skipped; if `True`, they are downloaded as well. Such
preorder invoices are not typically useful for accounting since they claim a
card was charged even though it actually has not been yet; they get replaced
with invoices containing the correct information when the order is actually
fulfilled.
Output format:
==============
Each regular or digital order invoice is written in HTML format to the specified
`output_directory` using the naming scheme `<order-id>.html`,
e.g. `166-7926740-5141621.html` for a regular order invoice and
`D56-5204779-4181560.html` for a digital order invoice.
Example:
========
    def CONFIG_amazon():
        return dict(
            module='finance_dl.amazon',
            credentials={
                'username': 'XXXXXX',
                'password': 'XXXXXX',
            },
            output_directory=os.path.join(data_dir, 'amazon'),
            # profile_dir is optional.
            profile_dir=os.path.join(profile_dir, 'amazon'),
            # order_groups is optional.
            order_groups=['past 3 months'],
        )
Interactive shell:
==================
From the interactive shell, type: `self.run()` to start the scraper.
"""
import dataclasses
import urllib.parse
import re
import logging
import os
import datetime
import dateutil.parser
import bs4
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from atomicwrites import atomic_write
from . import scrape_lib
from typing import List, Optional
logger = logging.getLogger('amazon_scrape')
@dataclasses.dataclass
class Domain:
    """Site-specific strings and flags the scraper needs for one Amazon domain.

    Subclasses fill in the fields for a concrete domain and override
    `parse_date` to parse order dates in that domain's locale.
    """

    # Suffix appended to 'https://www.amazon.' (e.g. 'com', 'co.uk').
    top_level: str

    # Link texts used to detect / perform log in.
    sign_in: str
    sign_out: str

    # Find invoices.
    your_orders: str
    archived_orders: str
    invoice: str
    # Accepted link texts for links whose href contains "orderID=".
    invoice_link: List[str]
    order_summary: str
    # True if the order summary link only appears after opening a popover.
    order_summary_hidden: bool
    next: str

    # Confirm invoice page
    grand_total: str
    grand_total_digital: str
    order_cancelled: str
    pre_order: str
    # Regular expressions; group 1 captures the order date text.
    digital_order: str
    regular_order_placed: str

    # .COM: digital orders have own order list
    # other domains: digital orders are in the regular order list
    digital_orders_menu: bool
    digital_orders_menu_text: Optional[str] = None

    # Link text that marks an order needing the direct invoice URL fallback.
    fresh_fallback: Optional[str] = None

    @staticmethod
    def parse_date(date_str) -> datetime.date:
        """Parse an order date string as shown on this domain's invoice page.

        The base class has no locale knowledge; concrete domains override this.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            'parse_date must be implemented by Domain subclasses')
class DOT_COM(Domain):
    """Page strings and date parsing for amazon.com."""

    def __init__(self) -> None:
        settings = dict(
            top_level='com',
            sign_in='Sign In',
            sign_out='Sign Out',
            your_orders='Your Orders',
            archived_orders='Archived Orders',
            invoice='Invoice',
            # View invoice -> regular/digital order, View order -> Amazon Fresh
            invoice_link=["View order", "View invoice"],
            fresh_fallback="View order",
            order_summary='Order Summary',
            order_summary_hidden=False,
            next='Next',
            grand_total='Grand Total:',
            grand_total_digital='Grand Total:',
            order_cancelled='Order Canceled',
            pre_order='Pre-order',
            digital_order='Digital Order: (.*)',
            regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})',
            # amazon.com lists digital orders in their own menu.
            digital_orders_menu=True,
            digital_orders_menu_text='Digital Orders',
        )
        super().__init__(**settings)

    @staticmethod
    def parse_date(date_str) -> datetime.date:
        """Parse an order date string using dateutil's default parser."""
        parsed = dateutil.parser.parse(date_str)
        return parsed.date()
class DOT_CO_UK(Domain):
    """Page strings and date parsing for amazon.co.uk."""

    def __init__(self) -> None:
        settings = dict(
            top_level='co.uk',
            sign_in='Sign in',
            sign_out='Sign out',
            your_orders='Your Orders',
            archived_orders='Archived Orders',
            invoice='Invoice',
            # View invoice -> regular/digital order, View order -> Amazon Fresh
            invoice_link=["View order", "View invoice"],
            fresh_fallback="View order",
            order_summary='Order Summary',
            order_summary_hidden=False,
            next='Next',
            grand_total='Grand Total:',
            grand_total_digital='Grand Total:',
            order_cancelled='Order Canceled',
            pre_order='Pre-order',
            digital_order='Digital Order: (.*)',
            regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})',
            # Digital orders appear in the regular order list on this domain.
            digital_orders_menu=False,
        )
        super().__init__(**settings)

    @staticmethod
    def parse_date(date_str) -> datetime.date:
        """Parse an order date string using dateutil's default parser."""
        parsed = dateutil.parser.parse(date_str)
        return parsed.date()
class DOT_DE(Domain):
    """Page strings and date parsing for amazon.de."""

    def __init__(self) -> None:
        settings = dict(
            top_level='de',
            sign_in='Anmelden',
            sign_out='Abmelden',
            your_orders='Meine Bestellungen',
            archived_orders='Archivierte Bestellungen',
            invoice='Rechnung',
            invoice_link=["Bestelldetails anzeigen"],
            fresh_fallback=None,
            order_summary='Bestellübersicht',
            # The order summary link sits behind a popover on this domain.
            order_summary_hidden=True,
            next='Weiter',
            grand_total='Gesamtsumme:',
            grand_total_digital='Endsumme:',
            order_cancelled='Order Canceled',
            pre_order='Pre-order',
            digital_order='Digitale Bestellung: (.*)',
            regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})',
            # Digital orders appear in the regular order list on this domain.
            digital_orders_menu=False,
        )
        super().__init__(**settings)

    class _parserinfo(dateutil.parser.parserinfo):
        """dateutil parser configuration that knows the German month names."""

        MONTHS=[
            ('Jan', 'Januar'), ('Feb', 'Februar'), ('Mär', 'März'),
            ('Apr', 'April'), ('Mai', 'Mai'), ('Jun', 'Juni'),
            ('Jul', 'Juli'), ('Aug', 'August'), ('Sep', 'September'),
            ('Okt', 'Oktober'), ('Nov', 'November'), ('Dez', 'Dezember')
        ]

    @staticmethod
    def parse_date(date_str) -> datetime.date:
        """Parse a German day-first order date string (e.g. '3. März 2020')."""
        german = DOT_DE._parserinfo(dayfirst=True)
        parsed = dateutil.parser.parse(date_str, parserinfo=german)
        return parsed.date()
# Maps each supported `amazon_domain` configuration value to the class
# holding that domain's page strings.
DOMAINS = {
    ".com": DOT_COM,
    ".co.uk": DOT_CO_UK,
    ".de": DOT_DE,
}
class Scraper(scrape_lib.Scraper):
    """Selenium-driven scraper that saves Amazon order invoices as HTML files.

    The browser helpers used throughout (`wait_and_return`, `click`,
    `find_visible_elements`, ...) are inherited from `scrape_lib.Scraper`.
    """

    def __init__(self,
                 credentials,
                 output_directory,
                 dir_per_year=False,
                 amazon_domain: str = ".com",
                 regular: bool = True,
                 digital: Optional[bool] = None,
                 order_groups: Optional[List[str]] = None,
                 download_preorder_invoices: bool = False,
                 **kwargs):
        """Store the configuration; see the module docstring for each key.

        Remaining keyword arguments are forwarded to `scrape_lib.Scraper`.

        Raises:
            ValueError: if `amazon_domain` is not a key of `DOMAINS`.
        """
        super().__init__(**kwargs)
        if amazon_domain not in DOMAINS:
            raise ValueError(f"Domain '{amazon_domain}' not supported. Supported "
                             f"domains: {list(DOMAINS)}")
        self.domain = DOMAINS[amazon_domain]()
        self.credentials = credentials
        self.output_directory = output_directory
        self.dir_per_year = dir_per_year
        self.logged_in = False
        self.regular = regular
        # `digital` is only meaningful where a separate Digital Orders menu
        # exists; on other domains there is no menu text to look for, so the
        # setting is ignored (digital orders are part of the regular list).
        self.digital_orders_menu = bool(
            self.domain.digital_orders_menu and (digital is None or digital))
        self.order_groups = order_groups
        self.download_preorder_invoices = download_preorder_invoices

    def check_url(self, url):
        """Raise RuntimeError unless `url` is an https URL on the Amazon domain.

        The host must be `amazon.<top_level>` optionally preceded by
        subdomain labels.
        """
        # Escape the literal dots so that e.g. 'co.uk' cannot match 'coXuk'.
        netloc_re = r'([^.@]+\.)*amazon\.' + re.escape(self.domain.top_level)
        result = urllib.parse.urlparse(url)
        if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc):
            raise RuntimeError('Reached invalid URL: %r' % url)

    def check_after_wait(self):
        """Validate the browser's current URL with `check_url`."""
        self.check_url(self.driver.current_url)

    def login(self):
        """Open the Amazon start page and sign in unless already signed in."""
        logger.info('Initiating log in')
        self.driver.get('https://www.amazon.' + self.domain.top_level)
        if self.logged_in:
            return

        # A visible sign-out link means the browser profile still has a session.
        sign_out_links = self.find_elements_by_descendant_partial_text(
            self.domain.sign_out, 'a')
        if len(sign_out_links) > 0:
            logger.info('You must be already logged in!')
            self.logged_in = True
            return

        logger.info('Looking for sign-in link')
        sign_in_links, = self.wait_and_return(
            lambda: self.find_visible_elements_by_descendant_partial_text(
                self.domain.sign_in, 'a')
        )
        self.click(sign_in_links[0])

        logger.info('Looking for username link')
        (username, ), = self.wait_and_return(
            lambda: self.find_visible_elements(By.XPATH, '//input[@type="email"]')
        )
        username.send_keys(self.credentials['username'])
        username.send_keys(Keys.ENTER)
        self.finish_login()

    def finish_login(self):
        """Fill in the password form, tick "remember me" and submit it."""
        logger.info('Looking for password link')
        (password, ), = self.wait_and_return(
            lambda: self.find_visible_elements(By.XPATH, '//input[@type="password"]')
        )
        password.send_keys(self.credentials['password'])

        logger.info('Looking for "remember me" checkbox')
        # NOTE(review): the [0] raises IndexError while the checkbox is absent;
        # confirm wait_and_return tolerates that during polling.
        (remember_me, ) = self.wait_and_return(
            lambda: self.find_visible_elements(By.XPATH, '//input[@name="rememberMe"]')[0]
        )
        remember_me.click()

        with self.wait_for_page_load():
            password.send_keys(Keys.ENTER)
        logger.info('Logged in')
        self.logged_in = True

    def get_invoice_path(self, year, order_id):
        """Return the output path for an invoice.

        `year` is only used (as a subdirectory) when `dir_per_year` is set.
        """
        if self.dir_per_year:
            return os.path.join(self.output_directory, str(year), order_id + '.html')
        return os.path.join(self.output_directory, order_id + '.html')

    def get_order_id(self, href) -> str:
        """Extract the `orderID`/`orderId` query value from an order link.

        Raises:
            RuntimeError: if `href` does not contain a parsable order ID.
        """
        m = re.match(r'.*[&?]orderI[Dd]=((?:D)?[0-9\-]+)(?:&.*)?$', href)
        if m is None:
            raise RuntimeError(
                'Failed to parse order ID from href %r' % (href, ))
        return m[1]

    def get_orders(self, regular=True, digital_orders_menu=True):
        """Collect invoice links from the order lists, then download them.

        Args:
            regular: scan the regular "Your Orders" list.
            digital_orders_menu: additionally scan the separate digital
                orders list.

        Orders whose `<order-id>.html` already exists anywhere below the
        output directory are not downloaded again.
        """
        invoice_hrefs = []
        order_ids_seen = set()
        order_ids_downloaded = frozenset(
            name[:-len('.html')]
            for _, _, files in os.walk(self.output_directory)
            for name in files
            if name.endswith('.html')
        )

        def get_invoice_urls():
            # Walks all result pages of the currently selected order group.
            # Problem: different site structures depending on country
            # .com / .uk
            #   Order Summary buttons are directly visible and can be
            #   identified with href containing "orderID="
            #   but order summary may have different names, e.g. for Amazon Fresh orders
            # .de
            #   only link with href containing "orderID=" is "Bestelldetails anzeigen" (=Order Details)
            #   which is not helpful
            #   order summary is hidden behind submenu which requires a click to be visible

            def invoice_finder():
                # Return the order links on the page whose label is a known one.
                candidates = self.driver.find_elements(
                    By.XPATH, '//a[contains(@href, "orderID=")]')
                elements = []
                for invoice_link in candidates:
                    if invoice_link.text in self.domain.invoice_link:
                        elements.append(invoice_link)
                    elif invoice_link.text != "":
                        # different labels are possible e.g. for regular orders vs. Amazon fresh
                        # log non-empty link texts -> may be new type
                        logger.debug(
                            'Skipping invoice due to unknown invoice_link.text: %s',
                            invoice_link.text)
                return elements

            def invoice_link_finder(invoice_link):
                # The link is directly usable, except for the fallback label.
                href = invoice_link.get_attribute('href')
                order_id = self.get_order_id(href)
                if (self.domain.fresh_fallback is not None
                        and invoice_link.text == self.domain.fresh_fallback):
                    # Amazon Fresh order, construct link to invoice
                    logger.info("   Found likely Amazon Fresh order. Falling back to direct invoice URL.")
                    # Keep 'https:', '', '<host>' and replace the first path segment.
                    tokens = href.split("/")[:4]
                    tokens[-1] = f"gp/css/summary/print.html?orderID={order_id}"
                    href = "/".join(tokens)
                return (order_id, href)

            def invoice_link_finder_hidden(invoice_link):
                # get order id to later find the correct summary link
                order_id = self.get_order_id(invoice_link.get_attribute('href'))
                # get parent element to search for invoice menu button (has no orderID specified)
                parent = invoice_link.find_element(By.XPATH, "./..")
                # leading dot in './/' specifies to only search in children
                popover = parent.find_elements(
                    By.XPATH, './/a[contains(@href, "invoice/invoice.html")]')
                # depending on the order group the XPATH may be different
                if len(popover) == 0:
                    # The label must be a quoted XPath string literal; unquoted
                    # it would be read as an element name and match any link.
                    popover = parent.find_elements(
                        By.XPATH,
                        f'.//a[contains(text(), "{self.domain.invoice}") and @class="a-popover-trigger a-declarative"]')
                if len(popover) == 0:
                    logger.info('Link extraction failed for order id: %r', order_id)
                    return (False, False)
                # open invoice popover to extract invoice link
                popover[0].click()
                # submenu containing order summary takes some time to load after click
                summary_link, = self.wait_and_locate(
                    (By.XPATH, '//a[contains(@href,"{}") and contains(text(),"{}")]'.format(
                        order_id, self.domain.order_summary)))
                if summary_link:
                    href = summary_link.get_attribute('href')
                    return (order_id, href)
                logger.info('Link extraction failed for order id: %r', order_id)
                return (False, False)

            initial_iteration = True
            while True:
                # break when there is no "next page"
                if initial_iteration:
                    invoices = invoice_finder()
                else:
                    invoices, = self.wait_and_return(invoice_finder)
                initial_iteration = False

                for invoice_link in invoices:
                    if not self.domain.order_summary_hidden:
                        (order_id, href) = invoice_link_finder(invoice_link)
                    else:
                        (order_id, href) = invoice_link_finder_hidden(invoice_link)
                    if not order_id:
                        continue
                    if order_id in order_ids_seen:
                        logger.info('Skipping already-seen order id: %r', order_id)
                        continue
                    if order_id in order_ids_downloaded:
                        logger.info('Skipping already-downloaded invoice: %r', order_id)
                        continue
                    logger.info("Found order '%s'", order_id)
                    invoice_hrefs.append((href, order_id))
                    order_ids_seen.add(order_id)

                # Find next link
                next_links = self.find_elements_by_descendant_text_match(
                    f'. = "{self.domain.next}"', 'a', only_displayed=True)
                if len(next_links) == 0:
                    logger.info('Found no more pages')
                    break
                if len(next_links) != 1:
                    raise RuntimeError('More than one next link found')
                with self.wait_for_page_load():
                    logger.info("Next page.")
                    self.click(next_links[0])

        def retrieve_all_order_groups():
            # Steps through the options of the time filter drop-down by index;
            # the <select> is looked up again on every pass because selecting
            # an option reloads the page.
            order_select_index = 0
            while True:
                (order_filter,), = self.wait_and_return(
                    lambda: self.find_visible_elements(By.XPATH, '//select[@name="timeFilter"]')
                )
                order_select = Select(order_filter)
                num_options = len(order_select.options)
                if order_select_index >= num_options:
                    break
                option = order_select.options[order_select_index]
                option_text = option.text.strip()
                order_select_index += 1
                if option_text == self.domain.archived_orders:
                    continue
                if self.order_groups is not None and option_text not in self.order_groups:
                    logger.info('Skipping order group: %r', option_text)
                    continue
                logger.info('Retrieving order group: %r', option_text)
                if not option.is_selected():
                    with self.wait_for_page_load():
                        order_select.select_by_index(order_select_index - 1)
                get_invoice_urls()

        if regular:
            # on co.uk, orders link is hidden behind the menu, hence not directly clickable
            (orders_link,), = self.wait_and_return(
                lambda: self.find_elements_by_descendant_text_match(
                    f'. = "{self.domain.your_orders}"', 'a', only_displayed=False)
            )
            link = orders_link.get_attribute('href')
            scrape_lib.retry(lambda: self.driver.get(link), retry_delay=2)
            retrieve_all_order_groups()

        if digital_orders_menu:
            # orders in separate Digital Orders list (relevant for .COM)
            # other domains list digital orders within the regular order list
            (digital_orders_link,), = self.wait_and_return(
                lambda: self.find_elements_by_descendant_text_match(
                    f'contains(., "{self.domain.digital_orders_menu_text}")', 'a', only_displayed=True)
            )
            scrape_lib.retry(lambda: self.click(digital_orders_link),
                             retry_delay=2)
            retrieve_all_order_groups()

        self.retrieve_invoices(invoice_hrefs)

    def _get_order_date(self, source, order_id):
        """Return the order date found in an invoice page, or None.

        Order IDs starting with 'D' are treated as digital orders, whose date
        is in a table row; otherwise the "order placed" text is used.
        """
        # code blocks taken from beancount-import/amazon-invoice.py
        soup = bs4.BeautifulSoup(source, 'lxml')

        def regular_order_date(node):
            # order placed information in page header (top left)
            m = re.fullmatch(self.domain.regular_order_placed, node.text.strip())
            if m is None:
                return None
            return self.domain.parse_date(m.group(1))

        def digital_order_date(node):
            # information in heading of order table
            if node.name != 'tr':
                return None
            m = re.match(self.domain.digital_order, node.text.strip())
            if m is None:
                return None
            try:
                return self.domain.parse_date(m.group(1))
            except Exception:
                # The pattern captures arbitrary text; rows that do not hold a
                # parsable date are simply not the row we are looking for.
                return None

        extract = digital_order_date if order_id.startswith('D') else regular_order_date
        node = soup.find(lambda candidate: extract(candidate) is not None)
        if node is None:
            return None
        # Re-use the matcher that selected the node so both steps agree.
        return extract(node)

    def retrieve_invoices(self, invoice_hrefs):
        """Download every `(href, order_id)` pair and write it as HTML.

        Pre-order invoices are skipped unless `download_preorder_invoices`
        is set.

        Raises:
            ValueError: if an order page fails to load, does not mention its
                order ID, or (with `dir_per_year`) has no detectable date.
        """
        for href, order_id in invoice_hrefs:
            logger.info('Downloading invoice for order %r', order_id)
            with self.wait_for_page_load():
                self.driver.get(href)

            # For digital orders, Amazon dynamically generates some of the information.
            # Wait until it is all generated.
            def get_source():
                source = self.driver.page_source
                if (
                    self.domain.grand_total in source or
                    self.domain.grand_total_digital in source or
                    self.domain.order_cancelled in source
                ):
                    return source
                elif 'problem loading this order' in source:
                    raise ValueError(f'Failed to retrieve information for order {order_id}')
                elif self.find_visible_elements(By.XPATH, '//input[@type="password"]'):
                    # Amazon asked for the password again; answer and keep waiting.
                    self.finish_login()
                return None

            page_source, = self.wait_and_return(get_source)
            if self.domain.pre_order in page_source and not self.download_preorder_invoices:
                # Pre-orders don't have enough information to download yet. Skip them.
                logger.info(f'Skipping pre-order invoice {order_id}')
                # Only this invoice is skipped; the remaining ones still get
                # downloaded.
                continue
            if order_id not in page_source:
                raise ValueError(f'Failed to retrieve information for order {order_id}')

            # extract order date
            order_date = self._get_order_date(page_source, order_id)
            if order_date is None:
                if self.dir_per_year:
                    raise ValueError(f'Failed to get date for order {order_id}')
                # date is not necessary, so just log
                logger.info(f'Failed to get date for order {order_id}')
                year = None
            else:
                year = order_date.year

            invoice_path = self.get_invoice_path(year, order_id)
            os.makedirs(os.path.dirname(invoice_path), exist_ok=True)
            with atomic_write(
                    invoice_path, mode='w', encoding='utf-8',
                    newline='\n', overwrite=True) as f:
                # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8
                f.write('\ufeff' + page_source)
            logger.info('  Wrote %s', invoice_path)

    def run(self):
        """Log in, make sure the output directory exists and fetch all orders."""
        self.login()
        os.makedirs(self.output_directory, exist_ok=True)
        self.get_orders(
            regular=self.regular,
            digital_orders_menu=self.digital_orders_menu
        )
def run(**kwargs):
    """Module entry point: hand the configuration to `scrape_lib.run_with_scraper`."""
    scrape_lib.run_with_scraper(Scraper, **kwargs)
def interactive(**kwargs):
    """Return whatever `scrape_lib.interact_with_scraper` yields for this scraper."""
    session = scrape_lib.interact_with_scraper(Scraper, **kwargs)
    return session