Skip to content

Commit

Permalink
feat: 적절한 로그 출력 추가
Browse files Browse the repository at this point in the history
- 사용하지 않는 주석 및 import 제거
- 함수명 crawl에서 scrape으로 변경하여 의미 명확화
  • Loading branch information
inpink committed Nov 9, 2024
1 parent 1ba7d96 commit 1e7717c
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 35 deletions.
4 changes: 2 additions & 2 deletions src/main/java/knusearch/clear/InitDb.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import knusearch.clear.jpa.domain.Search;
import knusearch.clear.jpa.domain.SearchSite;
import knusearch.clear.jpa.domain.post.BasePost;
import knusearch.clear.jpa.service.CrawlService;
import knusearch.clear.jpa.service.ScrapingService;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
Expand All @@ -30,7 +30,7 @@ public void init() {
static class InitService {

private final EntityManager em;
private final CrawlService crawlService;
private final ScrapingService scrapingService;


public void dbInit1() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import java.util.ArrayList;
import java.util.List;
import knusearch.clear.jpa.domain.post.PostTerm;
import knusearch.clear.jpa.domain.post.Term;
import lombok.RequiredArgsConstructor;
import org.springframework.jdbc.core.BatchPreparedStatementSetter;
import org.springframework.jdbc.core.JdbcTemplate;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.regex.Pattern;
import knusearch.clear.constants.StringConstants;
import knusearch.clear.jpa.domain.post.BasePost;
import knusearch.clear.jpa.repository.post.BasePostRepository;
import knusearch.clear.util.ImageDownloader;
import knusearch.clear.util.OCRProcessor;
import lombok.RequiredArgsConstructor;
Expand All @@ -27,14 +28,15 @@
@Transactional(readOnly = true)
@RequiredArgsConstructor
@Slf4j
public class CrawlService {
public class ScrapingService {

private static final List<String> classifications = new ArrayList<>() {{
add("0"); //학사 : 학사 공지
add("1"); //장학 : 장학 : 장학금 주는 것
add("2"); //학습/상담 : 학습/상담 : 주로 교내. 학습 지원, 상담 지원
add("3"); //취창업 : 주로 교외. 취업, 창업 관련
}};
private final BasePostRepository basePostRepository;

@Transactional
public String makeFinalPostListUrl(String baseUrl, String postUrl, int pageIdx) {
Expand All @@ -60,7 +62,6 @@ public int totalPageIdx(String url) { //하나의 게시판에서 모든 페이
Element spanElement = div1.select("span").first();

String spanText = spanElement.text();
System.out.println("spanText = " + spanText);

// "/"를 기준으로 문자열을 분할
String[] parts = spanText.split("/");
Expand All @@ -70,7 +71,7 @@ public int totalPageIdx(String url) { //하나의 게시판에서 모든 페이
int totalPageIdx = Integer.parseInt(numberPart);

// 결과 출력
System.out.println("Extracted Number: " + totalPageIdx);
log.info("Extracted Number: " + totalPageIdx);
return totalPageIdx;
} catch (Exception e) {
// 예외 처리
Expand All @@ -82,7 +83,6 @@ public int totalPageIdx(String url) { //하나의 게시판에서 모든 페이

@Transactional
public Elements GetAllLinksFromOnePage(String baseUrl, String postUrl, int pageIdx) { //하나의 페이지에서 모든 게시물들 링크뽑아냄

//전체를 담을 List (현재 사용 X)
//List<BasePost> postList = new ArrayList<>();

Expand Down Expand Up @@ -197,21 +197,6 @@ public void setPostValues(BasePost basePost) { //하나의 게시물에서 제
}
}

// Interactively asks the operator (via the supplied Scanner) to choose a
// classification code for a post. `title` and `cutText` are printed first as
// context for the human making the decision.
// Returns the entered code once it matches an entry in the class-level
// `classifications` list; the operator gets up to 10 attempts, after which
// the method fails with an Exception.
// NOTE(review): the System.out.println calls here are deliberate console UI
// prompts for an interactive flow, not diagnostic logging — presumably why
// they were not migrated to the SLF4J logger; confirm before converting.
private String decideClassification(String title, String cutText, Scanner scanner) throws Exception {
System.out.println("title = " + title);
System.out.println("cutText = " + cutText);

for (int i = 0; i < 10; i++) { // allow up to 10 attempts
String clas = scanner.next();
if (classifications.contains(clas)) {
return clas;
}
// Message text: "You entered a class that does not exist."
System.out.println("없는 class를 입력하였습니다.");
}

// Message text: "Failed to decide a class."
throw new Exception("class를 정하지 못했습니다.");
}

private String extractText(String imageUrl) throws Exception {
if (imageUrl == null) {
return "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import knusearch.clear.jpa.repository.post.PostTermRepository;
import knusearch.clear.jpa.repository.post.TermJdbcRepository;
import knusearch.clear.jpa.repository.post.TermRepository;
import knusearch.clear.jpa.service.CrawlService;
import knusearch.clear.jpa.service.ScrapingService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
Expand All @@ -42,7 +42,7 @@
public class BasePostService {

//공통되는 부분
private final CrawlService crawlService;
private final ScrapingService scrapingService;
// postService들 -> cralwSerivce 접근OK,
// 반대로 cralwSerivce -> postService는 절대 금지(순환참조). 순환참조는 하면 안 됨
private final BasePostRepository basePostRepository;
Expand Down Expand Up @@ -207,27 +207,24 @@ public void crawlUpdate() { // crawl and make baseposts
String[] allPostUrl = getAllPostUrl();

for (String postUrl : allPostUrl) {
System.out.println("postUrl = " + postUrl);
String firsNoticetUrl = crawlService.makeFinalPostListUrl(baseUrl, postUrl, 1);
int totalPageIdx = crawlService.totalPageIdx(firsNoticetUrl); //총 페이지수 구해옴
String firsNoticetUrl = scrapingService.makeFinalPostListUrl(baseUrl, postUrl, 1);
int totalPageIdx = scrapingService.totalPageIdx(firsNoticetUrl); //총 페이지수 구해옴

for (int i = 1; i <= totalPageIdx; i++) {
//for (int i = 1; i <= 2; i++) { //너무 많으니까 일단 10개정도로 테스트

//굳이 안받아와도 되긴할듯 필요하면 받아오고 //상속관계를 이용하여 BaseContent로 통일!
//추상화를 통해 DIP(의존관계역전) 적용된 케이스임
//List<BasePost> contentList = scrapeWebPage(baseUrl, postUrl ,i); //10페이지에 있는 것 contentMain에 저장시킴?
Elements links = crawlService.GetAllLinksFromOnePage(baseUrl, postUrl, i);
Elements links = scrapingService.GetAllLinksFromOnePage(baseUrl, postUrl, i);

for (Element linkElement : links) {
BasePost basePost = new BasePost();
crawlService.setURLValues(basePost, linkElement, baseUrl, postUrl);
scrapingService.setURLValues(basePost, linkElement, baseUrl, postUrl);

//TODO: Transcational을 없애고, 아래 하나 완료될 때마다 바로 저장되도록
checkAndSave(basePost);
}

System.out.println(i + "번째 페이지에 있는 모든 게시글 크롤링");
log.info(i + "번째 페이지에 있는 모든 게시글 크롤링");
}
}
}
Expand All @@ -241,7 +238,7 @@ public void checkAndSave(BasePost basePost) {

//DB에 없는 것만 추가!!!
if (basePostRepository.findAllByEncryptedMenuSequenceAndEncryptedMenuBoardSequence(encMenuSeq, encMenuBoardSeq).size() == 0) {
crawlService.setPostValues(basePost);
scrapingService.setPostValues(basePost);
System.out.println(basePost.getTitle());
// 추출한 데이터를 MySQL 데이터베이스에 저장하는 코드 추가
basePostRepository.save(basePost); //★
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import static org.junit.Assert.assertEquals;

import knusearch.clear.jpa.domain.post.BasePost;
import knusearch.clear.jpa.service.CrawlService;
import knusearch.clear.jpa.service.ScrapingService;
import knusearch.clear.jpa.service.post.BasePostService;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
Expand All @@ -15,7 +15,7 @@
public class BasePostServiceTest {

@Mock
private CrawlService crawlService;
private ScrapingService scrapingService;

@InjectMocks
private BasePostService basePostService;
Expand Down

0 comments on commit 1e7717c

Please sign in to comment.