Skip to content
This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Commit

Permalink
Merge pull request #28 from jameshwc/great-hw
Browse files Browse the repository at this point in the history
v1.6.0: Download great homeworks
  • Loading branch information
jameshwc authored Feb 23, 2022
2 parents a3cdad5 + c5afb74 commit 5bb157b
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
37 changes: 28 additions & 9 deletions ceiba/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def crawl(self) -> Path:
if util.is_relative_to(Crawler.crawled_urls[self.url], self.path):
logging.debug(strings.url_duplicate.format(self.url))
return Crawler.crawled_urls[self.url]

response = util.get(self.session, self.url)
if response.status_code == 404 or response.content.startswith(
bytes('<html><head><title>Request Rejected</title>', encoding='utf-8')):
Expand All @@ -62,18 +62,19 @@ def crawl(self) -> Path:
soup = BeautifulSoup(response.content, 'html.parser')
self.download_css(soup.find_all('link'))
self.download_imgs(soup.find_all('img'))


if self.module == 'board':
self.__handle_board(soup.find_all('caption')) # special case for board
elif self.module == 'bulletin':
soup = self.__handle_bulletin(soup, response.url)

elif self.module == 'hw':
soup = self.__handle_hw(soup, response.url)

soup = self.crawl_hrefs(soup, response.url)

for op in soup.find_all('option'):
op.extract() # TODO: we should use <a> to replace <option>

filepath.write_text(str(soup), encoding='utf-8')
return filepath

Expand Down Expand Up @@ -103,7 +104,7 @@ def crawl_hrefs(self, soup: BeautifulSoup, resp_url: str) -> BeautifulSoup:
skip_href_texts = util.board_skip_href_texts
elif self.module == 'student':
skip_href_texts = util.student_skip_href_texts

hrefs = soup.find_all('a')
a: Tag
for a in hrefs:
Expand All @@ -115,15 +116,15 @@ def crawl_hrefs(self, soup: BeautifulSoup, resp_url: str) -> BeautifulSoup:
continue
filename = a.text
text = a.text

if self.module == 'vote' and a.get('href') == "#" and a.get('onclick'):
m = re.search(r"window\.open\(\'(.*?)\'.*", a.get('onclick'))
if m:
url = urljoin(resp_url, m.group(1))
del a['onclick']
filename = a.parent.parent.find_all('td')[1].text.strip()
text = filename

crawler_path = self.path
if self._is_board and a.text in self._board_dir:
crawler_path = self._board_dir[a.text]
Expand Down Expand Up @@ -153,7 +154,7 @@ def __handle_board(self, captions: List[Tag]):
self._board_dir[a_tag.text].mkdir(exist_ok=True)
self._is_board = True
break

def __handle_bulletin(self, soup: BeautifulSoup, resp_url: str):
op: Tag
for op in soup.find_all('option'):
Expand All @@ -167,6 +168,24 @@ def __handle_bulletin(self, soup: BeautifulSoup, resp_url: str):
select.replaceWithChildren()
return soup

def __handle_hw(self, soup: BeautifulSoup, resp_url: str) -> BeautifulSoup:
    """Rewrite javascript-only 'great homework' buttons into plain <a> links.

    Ceiba renders links to exemplary ("great") homeworks as
    ``<input class="btn">`` elements whose ``onclick`` calls
    ``hw_view('<lang>','<sn>')``. The archived page has no javascript, so
    each such button is converted in place into an anchor pointing at the
    equivalent ``hw_view.php`` URL, letting the generic href crawler pick
    it up later.

    :param soup: parsed homework page to mutate in place
    :param resp_url: URL of the fetched page, used as the join base
    :return: the same ``soup`` object, mutated
    """
    button: Tag
    for button in soup.find_all('input', {'class': 'btn'}):
        onclick_val = button.get('onclick')
        if onclick_val is None:
            continue
        # Expected shape: hw_view('<lang>','<sn>')
        m = re.search(r"hw_view\('(.*)','(.*)'\)", onclick_val)
        if not (m and m.group(1) and m.group(2)):
            continue
        lang, hw_sn = m.group(1), m.group(2)
        # Bug fix: the original f-string embedded a literal `+"` between the
        # two query parameters (…{lang}+"&hw_sn=…), a leftover from string
        # concatenation, which produced a malformed query string.
        button['href'] = urljoin(resp_url, f'hw_view.php?current_lang={lang}&hw_sn={hw_sn}')
        # Turn the <input> into an <a>: keep the visible label as link text,
        # then strip the now-meaningless input attributes.
        button.name = 'a'
        button.string = button['value']
        for attr in ('type', 'value', 'onclick', 'class'):
            del button[attr]
    return soup

def download_imgs(self, imgs: ResultSet):
img: Tag
for img in imgs:
Expand Down Expand Up @@ -201,7 +220,7 @@ def __save_files(self, content: bytes) -> Path:
Crawler.crawled_files_path.add(filepath)
Crawler.crawled_urls[self.url] = filepath
return filepath

def __get_uniq_filepath(self, path: Path):
if path not in Crawler.crawled_files_path:
return path
Expand Down
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.5.1
1.6.0

0 comments on commit 5bb157b

Please sign in to comment.