Skip to content

Commit

Permalink
feat: 어제 새롭게 올라온 게시글만 스크래핑하도록 기능 추가
Browse files Browse the repository at this point in the history
  • Loading branch information
inpink committed Nov 9, 2024
1 parent 1e7717c commit dedb84b
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 21 deletions.
2 changes: 2 additions & 0 deletions src/main/java/knusearch/clear/ClearApplication.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.annotation.EnableScheduling;

@SpringBootApplication
@EnableScheduling
public class ClearApplication {

public static void main(String[] args) throws Exception {
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/knusearch/clear/jpa/domain/site/Site.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import java.util.Arrays;
import java.util.List;
import lombok.Getter;

@Getter
public enum Site {
MAIN("메인", "https://web.kangnam.ac.kr/menu/", MainBoard.values()),
ICT("ICT", "https://sae.kangnam.ac.kr/menu/", IctBoard.values()),
Expand Down
67 changes: 59 additions & 8 deletions src/main/java/knusearch/clear/jpa/service/ScrapingService.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import knusearch.clear.constants.StringConstants;
import knusearch.clear.jpa.domain.post.BasePost;
import knusearch.clear.jpa.domain.site.Board;
import knusearch.clear.jpa.domain.site.Site;
import knusearch.clear.jpa.repository.post.BasePostRepository;
import knusearch.clear.util.ImageDownloader;
import knusearch.clear.util.OCRProcessor;
Expand Down Expand Up @@ -37,6 +40,7 @@ public class ScrapingService {
add("3"); //취창업 : 주로 교외. 취업, 창업 관련
}};
private final BasePostRepository basePostRepository;
private final ClassificationService classificationService;

@Transactional
public String makeFinalPostListUrl(String baseUrl, String postUrl, int pageIdx) {
Expand All @@ -45,10 +49,10 @@ public String makeFinalPostListUrl(String baseUrl, String postUrl, int pageIdx)

@Transactional
public String makeFinalPostUrl(String baseUrl, String postUrl,
boolean scrtWrtiYn, String encMenuSeq, String encMenuBoardSeq) {
boolean scrtWrtiYn, String encMenuSeq, String encMenuBoardSeq) {
return baseUrl + "board/info/" + postUrl
+ "?scrtWrtiYn=" + scrtWrtiYn + "&encMenuSeq="
+ encMenuSeq + "&encMenuBoardSeq=" + encMenuBoardSeq;
+ "?scrtWrtiYn=" + scrtWrtiYn + "&encMenuSeq="
+ encMenuSeq + "&encMenuBoardSeq=" + encMenuBoardSeq;
}

//@Transactional : 트랜잭션 생성.
Expand Down Expand Up @@ -82,13 +86,14 @@ public int totalPageIdx(String url) { //하나의 게시판에서 모든 페이
}

@Transactional
public Elements GetAllLinksFromOnePage(String baseUrl, String postUrl, int pageIdx) { //하나의 페이지에서 모든 게시물들 링크뽑아냄
public Elements getAllLinksFromOnePage(String baseUrl, String postUrl,
int pageIdx) { //하나의 페이지에서 모든 게시물들 링크뽑아냄
//전체를 담을 List (현재 사용 X)
//List<BasePost> postList = new ArrayList<>();

try {
Document document = Jsoup.connect(
makeFinalPostListUrl(baseUrl, postUrl, pageIdx)).get();
makeFinalPostListUrl(baseUrl, postUrl, pageIdx)).get();

// 게시물 목록에서 각 게시물의 URL을 추출
Element div1 = document.select(".sec_inner").first();
Expand All @@ -106,7 +111,8 @@ public Elements GetAllLinksFromOnePage(String baseUrl, String postUrl, int pageI
}

@Transactional
public void setURLValues(BasePost basePost, Element linkElement, String baseUrl, String postUrl) {
public BasePost setURLValues(Element linkElement, String baseUrl, String postUrl) {
BasePost basePost = new BasePost();
String dataParams = linkElement.attr("data-params");
/*System.out.println("dataParams"+dataParams);*/

Expand All @@ -120,14 +126,15 @@ public void setURLValues(BasePost basePost, Element linkElement, String baseUrl,
String encMenuBoardSeq = jsonObject.getString("encMenuBoardSeq");

// 최종 URL을 생성
String finalURL = makeFinalPostUrl(baseUrl, postUrl, scrtWrtiYn, encMenuSeq, encMenuBoardSeq);
String finalURL = makeFinalPostUrl(baseUrl, postUrl, scrtWrtiYn, encMenuSeq,
encMenuBoardSeq);

basePost.setUrl(finalURL);
basePost.setScrtWrtiYn(scrtWrtiYn);
basePost.setEncryptedMenuSequence(encMenuSeq);
basePost.setEncryptedMenuBoardSequence(encMenuBoardSeq);
//System.out.println("finalURL = " + finalURL);

return basePost;
}

@Transactional
Expand Down Expand Up @@ -233,4 +240,48 @@ public String cutString(String text, int byteSize) {
return text;
}

public void scrapeYesterdayPosts(Site site) {
    // Scrapes every board of the given site, persisting only the posts
    // published yesterday (invoked by the nightly scheduler just after
    // midnight, so "yesterday" is the most recently completed day).
    //
    // NOTE(review): this calls the @Transactional savePostsWithinPeriod on
    // `this`, which bypasses the Spring proxy — no transaction is actually
    // opened for that call. Confirm whether that is intended.
    final LocalDate yesterday = LocalDate.now().minusDays(1);
    final String baseUrl = site.getBaseUrl();

    site.getBoards()
            .forEach(board -> savePostsWithinPeriod(baseUrl, board.getEncryptedName(), yesterday));
}

@Transactional
public void savePostsWithinPeriod(String baseUrl, String postUrl, LocalDate yesterday) {
    // Walks the board's pages from page 1 onward, saving posts until
    // checkWithinPeriodAndSave reports that a post older than `yesterday`
    // was reached.
    //
    // BUG FIX: the loop condition was `while (isTimeToBreak)` with the flag
    // initialised to false, so the body never executed and nothing was ever
    // scraped. Loop while it is NOT yet time to stop.
    int pageIdx = 1;
    boolean isTimeToBreak = false;
    while (!isTimeToBreak) {
        Elements links = getAllLinksFromOnePage(baseUrl, postUrl, pageIdx);
        if (links == null || links.isEmpty()) {
            // Guard: an empty/exhausted page would otherwise never set the
            // stop flag and spin forever once the loop condition is fixed.
            break;
        }
        isTimeToBreak = checkWithinPeriodAndSave(baseUrl, postUrl, yesterday, links);
        pageIdx++;
    }
}

/**
 * Iterates the post links of one listing page, persisting each post whose
 * date is on or after {@code yesterday}.
 *
 * @param baseUrl   site base URL used to build each post's final URL
 * @param postUrl   encrypted board path segment
 * @param yesterday cut-off date; posts strictly before it stop the scan
 * @param links     anchor elements of one listing page
 * @return true when a post older than the cut-off was reached (caller should
 *         stop paging); false when the whole page was within the period
 */
public boolean checkWithinPeriodAndSave(
        String baseUrl,
        String postUrl,
        LocalDate yesterday,
        Elements links
) {
    for (Element linkElement : links) {
        BasePost basePost = setURLValues(linkElement, baseUrl, postUrl);

        // Check the date FIRST: the original ran the (expensive)
        // classification before this check, wasting work on every
        // out-of-range post that is never saved.
        //
        // NOTE(review): the visible setURLValues only populates URL-related
        // fields — getDateTime()/getText()/getTitle() look unset here and
        // would be null unless the post content is fetched elsewhere before
        // this point. TODO confirm; a null dateTime throws NPE below.
        LocalDate dateTime = basePost.getDateTime();
        if (dateTime.isBefore(yesterday)) {
            return true;
        }

        Map<String, Object> predictResult =
                classificationService.predictClassification(basePost.getText() + basePost.getTitle());
        basePost.setClassification((String) predictResult.get("predictedClass"));

        basePostRepository.save(basePost);
    }
    return false;
}
}
25 changes: 12 additions & 13 deletions src/main/java/knusearch/clear/jpa/service/post/BasePostService.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import knusearch.clear.jpa.domain.post.BasePost;
import knusearch.clear.jpa.domain.post.PostTerm;
import knusearch.clear.jpa.domain.post.Term;
import knusearch.clear.jpa.domain.site.Board;
import knusearch.clear.jpa.domain.site.Site;
import knusearch.clear.jpa.repository.post.BasePostRepository;
import knusearch.clear.jpa.repository.post.PostTermJdbcRepository;
import knusearch.clear.jpa.repository.post.PostTermRepository;
Expand Down Expand Up @@ -204,9 +206,10 @@ public boolean containsSpecialCharacter(String text) {
@Transactional
public void crawlUpdate() { // crawl and make baseposts
String baseUrl = getBaseUrl();
String[] allPostUrl = getAllPostUrl();
List<Board> boards = getBoards();

for (String postUrl : allPostUrl) {
for (Board board : boards) {
String postUrl = board.getEncryptedName();
String firsNoticetUrl = scrapingService.makeFinalPostListUrl(baseUrl, postUrl, 1);
int totalPageIdx = scrapingService.totalPageIdx(firsNoticetUrl); //총 페이지수 구해옴

Expand All @@ -216,11 +219,10 @@ public void crawlUpdate() { // crawl and make baseposts
//굳이 안받아와도 되긴할듯 필요하면 받아오고 //상속관계를 이용하여 BaseContent로 통일!
//추상화를 통해 DIP(의존관계역전) 적용된 케이스임
//List<BasePost> contentList = scrapeWebPage(baseUrl, postUrl ,i); //10페이지에 있는 것 contentMain에 저장시킴?
Elements links = scrapingService.GetAllLinksFromOnePage(baseUrl, postUrl, i);
Elements links = scrapingService.getAllLinksFromOnePage(baseUrl, postUrl, i);

for (Element linkElement : links) {
BasePost basePost = new BasePost();
scrapingService.setURLValues(basePost, linkElement, baseUrl, postUrl);
BasePost basePost = scrapingService.setURLValues(linkElement, baseUrl, postUrl);

checkAndSave(basePost);
}
Expand Down Expand Up @@ -361,13 +363,10 @@ public List<BasePostClassifyResponse> findBasePostsNotInClassifications(List<Str
}

public String getBaseUrl() {
return "https://web.kangnam.ac.kr/menu/";
//return Site.findBaseUrl(basePost.get);
} // TODO:

public String[] getAllPostUrl() {
return new String[]{"f19069e6134f8f8aa7f689a4a675e66f.do",
"e4058249224f49ab163131ce104214fb.do"};
//공지사항, 행사/안내 등
return Site.MAIN.getBaseUrl();
}

public List<Board> getBoards() {
return Site.MAIN.getBoards();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package knusearch.clear.jpa.service.post;

import knusearch.clear.jpa.domain.site.Site;
import knusearch.clear.jpa.service.ScrapingService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

@Component
@Slf4j
public class PostScrapingTask {

private final ScrapingService scrapingService;

public PostScrapingTask(ScrapingService scrapingService) {
this.scrapingService = scrapingService;
}

@Scheduled(cron = "0 0 0 * * *")
public void performTask() {
scrapingService.scrapeYesterdayPosts(Site.MAIN);
}
}

0 comments on commit dedb84b

Please sign in to comment.