Skip to content

Commit

Permalink
feat: 새로운 게시글 스크래핑 후 역색인 및 BM25 업데이트, 비동기 및 재시도
Browse files Browse the repository at this point in the history
- BM25 업데이트 작업을 비동기 메서드로 분리하여 실행 (`updateBM25Async`)
- 실패 시 최대 3회 재시도하도록 @Retryable 적용, 재시도 간격 2초 설정
- CompletableFuture.runAsync() 사용하여 비동기 실행 후 thenRun()으로 후속 처리 추가
  • Loading branch information
inpink committed Nov 9, 2024
1 parent dedb84b commit 1dc71da
Show file tree
Hide file tree
Showing 12 changed files with 185 additions and 103 deletions.
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ dependencies {
implementation 'org.json:json:20190722'
implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch'
implementation 'net.sourceforge.tess4j:tess4j:4.5.4'
implementation 'org.springframework.retry:spring-retry'
implementation 'org.springframework:spring-aspects'

testImplementation 'junit:junit:4.13.1'
compileOnly 'org.projectlombok:lombok'
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/knusearch/clear/ClearApplication.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ApplicationContext;
import org.springframework.retry.annotation.EnableRetry;
import org.springframework.scheduling.annotation.EnableScheduling;

@SpringBootApplication
@EnableScheduling
@EnableRetry
public class ClearApplication {

public static void main(String[] args) throws Exception {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/knusearch/clear/jpa/domain/site/MainBoard.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ public enum MainBoard implements Board {

// Accessor for this board's name; overrides a supertype method (see @Override).
// NOTE(review): this span is a diff rendered without +/- markers, so both
// `return null;` and `return name;` appear. Presumably the commit replaces the
// former with the latter — confirm against the repository.
@Override
public String getName() {
return null;
return name;
}

// Accessor for this board's encrypted name; overrides a supertype method (see @Override).
// NOTE(review): this span is a diff rendered without +/- markers, so both
// `return null;` and `return encryptedName;` appear. Presumably the commit replaces
// the former with the latter — confirm against the repository.
@Override
public String getEncryptedName() {
return null;
return encryptedName;
}
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
package knusearch.clear.jpa.repository.post;

import java.util.List;
import knusearch.clear.jpa.domain.post.PostTerm;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.stereotype.Repository;

@Repository
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import org.springframework.web.client.RestTemplate;

@Service
@Transactional(readOnly = true)
@RequiredArgsConstructor
@Slf4j
public class ClassificationService {
Expand All @@ -31,11 +30,16 @@ public class ClassificationService {
}};

@Transactional
public Map<String, Object> predictClassification(final String searchQuery) {
public Map<String, Object> predictClassification(String query) {
query = query.replaceAll("[^가-힣\\s]", ""); // 한글과 공백을 제외한 모든 문자를 제거
if (query.isEmpty()) {
throw new IllegalArgumentException("Query is empty");
}

// Flask 서버에 요청을 보내기 위한 데이터 구성
String flaskEndpoint = "http://13.209.132.169:5000/predict"; // Flask 서버의 URL
Map<String, String> requestBody = new HashMap<>();
requestBody.put("text", searchQuery);
requestBody.put("text", query);

// Flask 서버로 POST 요청을 보내고 응답 받기
ResponseEntity<String> response = restTemplate.postForEntity(flaskEndpoint, requestBody, String.class);
Expand Down
94 changes: 36 additions & 58 deletions src/main/java/knusearch/clear/jpa/service/ScrapingService.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package knusearch.clear.jpa.service;


import jakarta.persistence.Tuple;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
Expand All @@ -14,6 +15,7 @@
import knusearch.clear.jpa.domain.site.Board;
import knusearch.clear.jpa.domain.site.Site;
import knusearch.clear.jpa.repository.post.BasePostRepository;
import knusearch.clear.jpa.service.post.CheckPostResult;
import knusearch.clear.util.ImageDownloader;
import knusearch.clear.util.OCRProcessor;
import lombok.RequiredArgsConstructor;
Expand All @@ -24,11 +26,11 @@
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;


@Service
@Transactional(readOnly = true)
@RequiredArgsConstructor
@Slf4j
public class ScrapingService {
Expand Down Expand Up @@ -121,7 +123,7 @@ public BasePost setURLValues(Element linkElement, String baseUrl, String postUrl

// "scrtWrtiYn"와 "encMenuSeq"와 "encMenuBoardSeq" 값을 가져오기
// ↓기본값으로 false를 설정. 어떤 사이트에는 scrtWrtiYn값이 없다. scrtWrtiYn는 상위 노출되는 공지유무를 뜻함
Boolean scrtWrtiYn = jsonObject.optBoolean("scrtWrtiYn", false); //얘는 boolean
boolean scrtWrtiYn = jsonObject.optBoolean("scrtWrtiYn", false); //얘는 boolean
String encMenuSeq = jsonObject.getString("encMenuSeq");
String encMenuBoardSeq = jsonObject.getString("encMenuBoardSeq");

Expand All @@ -133,7 +135,6 @@ public BasePost setURLValues(Element linkElement, String baseUrl, String postUrl
basePost.setScrtWrtiYn(scrtWrtiYn);
basePost.setEncryptedMenuSequence(encMenuSeq);
basePost.setEncryptedMenuBoardSequence(encMenuBoardSeq);

return basePost;
}

Expand All @@ -147,11 +148,9 @@ public void setPostValues(BasePost basePost) { //하나의 게시물에서 제
// 원하는 div 요소 선택 (class가 "tbl_view"인 div를 선택)
Element divElement = document.select(".tblw_subj").first();
String title = divElement.text(); // div 내용 추출
//System.out.println("크롤링 제목:" + title);

Element divElement2 = document.select(".tbl_view").first();
String text = divElement2.text(); // div 내용 추출
/*System.out.println("크롤링 본문:" + text);*/

// 이미지 태그 선택
Elements imgElements = divElement2.select("img");
Expand All @@ -160,7 +159,6 @@ public void setPostValues(BasePost basePost) { //하나의 게시물에서 제
String imageSrc = null;
for (Element imgElement : imgElements) { //여러개면 여러개 다 뽑아냄. 일단 지금은 db에 마지막 1개만 담고있음
imageSrc = "https://web.kangnam.ac.kr" + imgElement.attr("src");
/*System.out.println("크롤링 본문의 이미지 소스 링크:" + "https://web.kangnam.ac.kr" + imageSrc);*/
}

//Date 추출. span으로 묶여있어서 파싱으로 Date 형식만 가져옴
Expand All @@ -175,27 +173,16 @@ public void setPostValues(BasePost basePost) { //하나의 게시물에서 제
if (matcher.find()) {
// 그룹 1에서 일치하는 문자열 가져오기
dateString = matcher.group(1);
/*System.out.println("크롤링 Date: " + dateTime);*/
}

DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy.MM.dd HH:mm");
LocalDate dateTime = LocalDate.parse(dateString, formatter);

/*// 분류 추출
Element divElement4 = document.select(".colum20").first();
String classification = divElement4.text().split(" ")[1]; // div 내용 추출
// div안에 다른 div가 있는 구조라, split으로 분리해서 classification명만 추출
*/

// 이미지에서 텍스트 추출
String extractedText = extractText(imageSrc);

Scanner scanner = new Scanner(System.in);
final String cutText = cutString(text, BasePost.TEXT_COLUMN_LENGTH);
basePost.setTitle(title);
basePost.setText(cutText);
basePost.setImage(cutString(imageSrc, BasePost.IMAGE_COLUMN_LENGTH));
basePost.setImageText(cutString(extractedText, BasePost.TEXT_COLUMN_LENGTH));
basePost.setDateTime(dateTime);
basePost.setClassification(StringConstants.UNDETERMINED.getDescription());
} catch (Exception e) {
Expand All @@ -204,32 +191,6 @@ public void setPostValues(BasePost basePost) { //하나의 게시물에서 제
}
}

/**
 * Downloads the image at {@code imageUrl} and runs OCR on the downloaded file to extract text.
 *
 * <p>Returns an empty string when {@code imageUrl} is {@code null}, or when the download or
 * OCR step throws. In the failure case the exception is only printed to standard output and
 * is not rethrown.
 *
 * @param imageUrl URL of the image to process; may be {@code null}
 * @return the value returned by {@code OCRProcessor.extractTextFromImage}, or {@code ""} on
 *     {@code null} input or on any caught exception
 * @throws Exception declared by the signature; exceptions raised inside the try block are
 *     caught here
 */
private String extractText(String imageUrl) throws Exception {
if (imageUrl == null) {
return "";
}

// Use a temporary file name
// NOTE(review): the name is fixed, so overlapping calls would presumably write to the
// same file — confirm that callers are sequential.
String filename = "downloaded_image"; // no extension

// Download the image
//System.out.println("imageUrl = " + imageUrl);
try {
ImageDownloader.downloadImage(imageUrl, filename);

// Extract text using OCR
// NOTE(review): the download receives the name without an extension while OCR reads
// filename + ".jpg"; presumably downloadImage appends ".jpg" — confirm.
String extractedText = OCRProcessor.extractTextFromImage(filename + ".jpg");
//System.out.println("Extracted Text: " + extractedText);

return extractedText;
} catch (Exception e) {
// Best-effort: any failure is printed and treated as "no text".
System.out.println(e);
return "";
}
}


//글자수가 len*4 Byte를 초과하는 경우 cut하기.
public String cutString(String text, int byteSize) {
int koreanLen = byteSize / 4;
if (text != null && text.length() > koreanLen) {
Expand All @@ -240,48 +201,65 @@ public String cutString(String text, int byteSize) {
return text;
}

/**
 * Iterates over every board of {@code site} and delegates to {@code savePostsWithinPeriod}
 * with the site's base URL and the board's encrypted name.
 *
 * <p>NOTE(review): this span is a diff rendered without +/- markers, so two revisions are
 * interleaved. Presumably the {@code void} signature, the {@code yesterday} local and the
 * three-argument call are the removed lines, and the {@code List<BasePost>} variant that
 * accumulates and returns the posts is the added one — confirm against the repository.
 *
 * @param site the site whose boards are scraped
 * @return (List-returning variant) the posts returned by the per-board calls, appended in
 *     board iteration order
 */
public void scrapeYesterdayPosts(Site site) {
public List<BasePost> scrapeYesterdayPosts(Site site) {
List<BasePost> basePosts = new ArrayList<>();
String baseUrl = site.getBaseUrl();
List<Board> boards = site.getBoards();

final LocalDate yesterday = LocalDate.now().minusDays(1);

for (Board board : boards) {
// The board's encrypted name is what gets passed on as the post URL argument.
String postUrl = board.getEncryptedName();
savePostsWithinPeriod(baseUrl, postUrl, yesterday);
List<BasePost> newPosts = savePostsWithinPeriod(baseUrl, postUrl);
basePosts.addAll(newPosts);
}

return basePosts;
}

/**
 * Walks the board's listing pages starting at page 1, passing each page's links to
 * {@code checkWithinPeriodAndSave} until that call signals that iteration should stop.
 *
 * <p>NOTE(review): this span is a diff rendered without +/- markers, so two revisions are
 * interleaved (two annotations, two signatures, two {@code while} headers). Presumably the
 * {@code void}/three-argument lines are the removed ones — confirm against the repository.
 *
 * <p>NOTE(review): {@code scrapeYesterdayPosts} in this class calls this method directly.
 * With Spring's default proxy-based transaction handling, a same-class call presumably
 * bypasses the proxy, so {@code REQUIRES_NEW} may not take effect — confirm.
 *
 * @param baseUrl base URL of the site
 * @param postUrl board identifier forwarded to the page fetch and to the per-page check
 * @return (List-returning variant) every post reported as new by the per-page checks
 */
@Transactional
public void savePostsWithinPeriod(String baseUrl, String postUrl, LocalDate yesterday) {
@Transactional(readOnly = false, propagation = Propagation.REQUIRES_NEW)
public List<BasePost> savePostsWithinPeriod(String baseUrl, String postUrl) {
List<BasePost> newPosts = new ArrayList<>();
int pageIdx = 1;
boolean isTimeToBreak = false;
// With the flag initialised to false, this header never enters the loop body.
while (isTimeToBreak) {

// This header keeps paging until the per-page check sets the flag.
while (!isTimeToBreak) {
Elements links = getAllLinksFromOnePage(baseUrl, postUrl, pageIdx);
isTimeToBreak = checkWithinPeriodAndSave(baseUrl, postUrl, yesterday, links);
CheckPostResult checkPostResult = checkWithinPeriodAndSave(baseUrl, postUrl, links);
// The stop signal comes only from the per-page result; no page-count bound is applied here.
isTimeToBreak = checkPostResult.isShouldBreak();
newPosts.addAll(checkPostResult.getNewPosts());
pageIdx++;
}
return newPosts;
}

/**
 * Builds a {@code BasePost} for each link on one listing page, fills in its values, sets its
 * classification from {@code classificationService}, and saves it through
 * {@code basePostRepository}.
 *
 * <p>NOTE(review): this span is a diff rendered without +/- markers, so two revisions are
 * interleaved. Presumably the {@code public boolean} signature, the {@code yesterday}
 * parameter, the single-line {@code predictClassification} call, the active
 * {@code isBefore} check and {@code return false;} are the removed lines — confirm against
 * the repository.
 *
 * <p>NOTE(review): in the {@code CheckPostResult} variant the date cut-off is commented out
 * and the only return passes {@code false}, so a caller looping until {@code shouldBreak}
 * gets no stop signal from this method — confirm this is intended.
 *
 * @param baseUrl base URL of the site
 * @param postUrl board identifier used when building each post
 * @param links link elements taken from one listing page
 * @return (CheckPostResult variant) a result carrying {@code false} and the posts saved
 *     from this page
 */
public boolean checkWithinPeriodAndSave(
private CheckPostResult checkWithinPeriodAndSave(
String baseUrl,
String postUrl,
LocalDate yesterday,
Elements links
) {
List<BasePost> newPosts = new ArrayList<>();
final LocalDate yesterday = LocalDate.now().minusDays(1);

for (Element linkElement : links) {
BasePost basePost = setURLValues(linkElement, baseUrl, postUrl);
setPostValues(basePost);

// The classifier input is the post text followed by the title, concatenated without a separator.
Map<String, Object> predictResult = classificationService.predictClassification(basePost.getText() + basePost.getTitle());
Map<String, Object> predictResult = classificationService.predictClassification(
basePost.getText() + basePost.getTitle());
basePost.setClassification((String) predictResult.get("predictedClass"));

LocalDate dateTime = basePost.getDateTime();
if (dateTime.isBefore(yesterday)) {
return true;
}
// Only the day-of-month of each date is logged.
log.info("yesterdat" + yesterday.getDayOfMonth());
log.info("dateTime" + dateTime.getDayOfMonth());

// Disabled cut-off check (left commented out in the source):
// if (dateTime.isBefore(yesterday)) {
// log.info("Time to break");
// return new CheckPostResult(true, newPosts);
// }

System.out.println("basePost = " + basePost.isScrtWrtiYn());
basePostRepository.save(basePost);
newPosts.add(basePost);
}
return false;
return new CheckPostResult(false, newPosts);
}
}
3 changes: 1 addition & 2 deletions src/main/java/knusearch/clear/jpa/service/SearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageImpl;
import org.springframework.data.domain.PageRequest;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.ui.Model;
import scala.collection.Seq;

@Primary
@Service
@Transactional(readOnly = true)
@Transactional
@RequiredArgsConstructor
public class SearchService {

Expand Down
Loading

0 comments on commit 1dc71da

Please sign in to comment.