-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdynamic_web_scraper.py
91 lines (71 loc) · 3.54 KB
/
dynamic_web_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from PIL import Image
import os
def download_image_from_url_to_dir(img_url, dir_path, image_index):
    '''
    Downloads the image at img_url into the dir_path directory.

    The file is named '<directory name>_<image_index>.jpg', where the directory
    name is the last component of dir_path.
    Nothing is written when the server answers with an error status (4xx/5xx).

    img_url: address of the image to download
    dir_path: existing directory that receives the file
    image_index: number appended to the file name
    '''
    # A timeout keeps one stalled connection from hanging the whole scrape
    response = requests.get(img_url, timeout=30)
    # status_code is never 0, so testing it directly is always true;
    # response.ok is False for 4xx/5xx answers
    if not response.ok:
        print(f'Skipping image {image_index}: HTTP status {response.status_code}')
        return
    # Only the last path component goes into the file name, otherwise a nested
    # dir_path such as 'out/44_derbi' would be repeated inside the joined path
    dir_name = os.path.basename(os.path.normpath(dir_path))
    image_path = os.path.join(dir_path, f'{dir_name}_{image_index}.jpg')
    # 'with' closes the file even if the write fails
    with open(image_path, 'wb') as fp:
        fp.write(response.content)
def scrape_images_from_url_to_dir(url, dir_path, chromedriver_path=r'C:\Windows\chromedriver.exe'):
    '''
    Downloads all images from the page at the given url to directory dir_path.
    If the directory (or any of its parents) doesn't exist it is made.
    url leads to a page with dynamic content loading, so the page needs to be
    scrolled iteratively for all images to load.
    Images are inside a div with id: 'outer_page_{image_index}'
    Image src is stored in img tag with class: 'absimg' in the 'src' attribute

    url: address of the page to scrape
    dir_path: directory that receives the downloaded images
    chromedriver_path: location of the chromedriver executable

    Scraping stops at the first index for which no page div (or no image tag
    inside it) is found. An exception raised by the WebDriverWait timeout is
    not caught here and propagates to the caller.
    The browser is always closed before the function returns or raises.
    '''
    # makedirs with exist_ok avoids the check-then-create race and handles nested paths
    os.makedirs(dir_path, exist_ok=True)

    service = webdriver.chrome.service.Service(chromedriver_path)
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])  # Turns off some annoying warnings
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        image_index = 0
        while True:
            image_index += 1
            try:
                outer_page_div = driver.find_element(By.XPATH, f"//div[@id='outer_page_{image_index}']")
                # Reading this property scrolls the div into view, which triggers loading of the image
                outer_page_div.location_once_scrolled_into_view
                # Wait for the page element to appear
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, f"page{image_index}")))
                while True:
                    image_tag = outer_page_div.find_element(By.XPATH, ".//img[@class='absimg']")
                    image_source = image_tag.get_attribute('src')
                    if image_source is not None:
                        break
                    time.sleep(1)  # Image was not yet loaded, wait for it to load
            except NoSuchElementException:
                print(f'Page ended at index {image_index}')
                break
            download_image_from_url_to_dir(image_source, dir_path, image_index)
    finally:
        # Without this every scraped url leaves a Chrome process running
        driver.quit()
def main():
    '''
    Scrapes each hard-coded Scribd document url into its own directory.
    '''
    # (page url, target directory) pairs, processed in order
    jobs = (
        (
            'https://www.scribd.com/fullscreen/140622754?access_key=key-abqPvSXwqinM1rGh9SsW&allow_share=true&escape=false&show_recommendations=false&view_mode=scroll',
            "44_derbi",
        ),
        (
            'https://www.scribd.com/fullscreen/140621346?access_key=key-abqPvSXwqinM1rGh9SsW&allow_share=true&escape=false&show_recommendations=false&view_mode=scroll',
            "45_tako_je_nast_TNT",
        ),
        (
            'https://www.scribd.com/fullscreen/140623957?access_key=key-abqPvSXwqinM1rGh9SsW&allow_share=true&escape=false&show_recommendations=false&view_mode=scroll',
            "46_povratak_superhika",
        ),
        (
            'https://www.scribd.com/fullscreen/140625940?access_key=key-abqPvSXwqinM1rGh9SsW&allow_share=true&escape=false&show_recommendations=false&view_mode=scroll',
            "47_superhikov_veliki_poduhvat",
        ),
    )
    for page_url, target_dir in jobs:
        scrape_images_from_url_to_dir(page_url, target_dir)
# Run the scraper only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()