Skip to content
This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Commit

Permalink
Merge pull request #28 from jameshwc/great-hw
Browse files Browse the repository at this point in the history
v1.6.0: Download great homeworks
  • Loading branch information
jameshwc authored Feb 23, 2022
2 parents a3cdad5 + c5afb74 commit 5bb157b
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
37 changes: 28 additions & 9 deletions ceiba/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def crawl(self) -> Path:
if util.is_relative_to(Crawler.crawled_urls[self.url], self.path):
logging.debug(strings.url_duplicate.format(self.url))
return Crawler.crawled_urls[self.url]

response = util.get(self.session, self.url)
if response.status_code == 404 or response.content.startswith(
bytes('<html><head><title>Request Rejected</title>', encoding='utf-8')):
Expand All @@ -62,18 +62,19 @@ def crawl(self) -> Path:
soup = BeautifulSoup(response.content, 'html.parser')
self.download_css(soup.find_all('link'))
self.download_imgs(soup.find_all('img'))


if self.module == 'board':
self.__handle_board(soup.find_all('caption')) # special case for board
elif self.module == 'bulletin':
soup = self.__handle_bulletin(soup, response.url)

elif self.module == 'hw':
soup = self.__handle_hw(soup, response.url)

soup = self.crawl_hrefs(soup, response.url)

for op in soup.find_all('option'):
op.extract() # TODO: we should use <a> to replace <option>

filepath.write_text(str(soup), encoding='utf-8')
return filepath

Expand Down Expand Up @@ -103,7 +104,7 @@ def crawl_hrefs(self, soup: BeautifulSoup, resp_url: str) -> BeautifulSoup:
skip_href_texts = util.board_skip_href_texts
elif self.module == 'student':
skip_href_texts = util.student_skip_href_texts

hrefs = soup.find_all('a')
a: Tag
for a in hrefs:
Expand All @@ -115,15 +116,15 @@ def crawl_hrefs(self, soup: BeautifulSoup, resp_url: str) -> BeautifulSoup:
continue
filename = a.text
text = a.text

if self.module == 'vote' and a.get('href') == "#" and a.get('onclick'):
m = re.search(r"window\.open\(\'(.*?)\'.*", a.get('onclick'))
if m:
url = urljoin(resp_url, m.group(1))
del a['onclick']
filename = a.parent.parent.find_all('td')[1].text.strip()
text = filename

crawler_path = self.path
if self._is_board and a.text in self._board_dir:
crawler_path = self._board_dir[a.text]
Expand Down Expand Up @@ -153,7 +154,7 @@ def __handle_board(self, captions: List[Tag]):
self._board_dir[a_tag.text].mkdir(exist_ok=True)
self._is_board = True
break

def __handle_bulletin(self, soup: BeautifulSoup, resp_url: str):
op: Tag
for op in soup.find_all('option'):
Expand All @@ -167,6 +168,24 @@ def __handle_bulletin(self, soup: BeautifulSoup, resp_url: str):
select.replaceWithChildren()
return soup

def __handle_hw(self, soup: BeautifulSoup, resp_url: str) -> BeautifulSoup:
    """Rewrite javascript-only 'great homework' buttons into plain <a> links.

    Ceiba renders links to exemplary ("great") homeworks as
    ``<input class="btn">`` elements whose ``onclick`` calls
    ``hw_view('<lang>','<sn>')``. The archived page has no javascript, so
    each such button is converted in place into an anchor pointing at the
    equivalent ``hw_view.php`` URL, letting the generic href crawler pick
    it up later.

    :param soup: parsed homework page to mutate in place
    :param resp_url: URL of the fetched page, used as the join base
    :return: the same ``soup`` object, mutated
    """
    button: Tag
    for button in soup.find_all('input', {'class': 'btn'}):
        onclick_val = button.get('onclick')
        if onclick_val is None:
            continue
        # Expected shape: hw_view('<lang>','<sn>')
        m = re.search(r"hw_view\('(.*)','(.*)'\)", onclick_val)
        if not (m and m.group(1) and m.group(2)):
            continue
        lang, hw_sn = m.group(1), m.group(2)
        # Bug fix: the original f-string embedded a literal `+"` between the
        # two query parameters (…{lang}+"&hw_sn=…), a leftover from string
        # concatenation, which produced a malformed query string.
        button['href'] = urljoin(resp_url, f'hw_view.php?current_lang={lang}&hw_sn={hw_sn}')
        # Turn the <input> into an <a>: keep the visible label as link text,
        # then strip the now-meaningless input attributes.
        button.name = 'a'
        button.string = button['value']
        for attr in ('type', 'value', 'onclick', 'class'):
            del button[attr]
    return soup

def download_imgs(self, imgs: ResultSet):
img: Tag
for img in imgs:
Expand Down Expand Up @@ -201,7 +220,7 @@ def __save_files(self, content: bytes) -> Path:
Crawler.crawled_files_path.add(filepath)
Crawler.crawled_urls[self.url] = filepath
return filepath

def __get_uniq_filepath(self, path: Path):
if path not in Crawler.crawled_files_path:
return path
Expand Down
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.5.1
1.6.0

0 comments on commit 5bb157b

Please sign in to comment.