From 8d805f665cec1fa177b3ce71141fc2bf741fb423 Mon Sep 17 00:00:00 2001 From: inpink Date: Sun, 10 Nov 2024 00:19:29 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20=EC=83=88=EB=A1=9C=EC=9A=B4=20=EA=B2=8C?= =?UTF-8?q?=EC=8B=9C=EA=B8=80=20=EC=8A=A4=ED=81=AC=EB=9E=98=ED=95=91=20?= =?UTF-8?q?=ED=9B=84=20=EC=97=AD=EC=83=89=EC=9D=B8=20=EB=B0=8F=20BM25?= =?UTF-8?q?=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8,=20=EB=B9=84=EB=8F=99?= =?UTF-8?q?=EA=B8=B0=20=EB=B0=8F=20=EC=9E=AC=EC=8B=9C=EB=8F=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - BM25 업데이트 작업을 비동기 메서드로 분리하여 실행 (`updateBM25Async`) - 실패 시 최대 3회 재시도하도록 @Retryable 적용, 재시도 간격 2초 설정 - CompletableFuture.runAsync() 사용하여 비동기 실행 후 thenRun()으로 후속 처리 추가 --- build.gradle | 2 + .../knusearch/clear/ClearApplication.java | 2 + .../repository/post/PostTermRepository.java | 2 - .../clear/jpa/service/ScrapingService.java | 26 ++++-- .../clear/jpa/service/post/BM25Service.java | 50 +++++++---- .../jpa/service/post/BM25UpdateException.java | 7 ++ .../jpa/service/post/BasePostService.java | 83 +++++++++++++++---- .../jpa/service/post/CheckPostResult.java | 14 ++++ .../jpa/service/post/PostScrapingTask.java | 12 +-- 9 files changed, 155 insertions(+), 43 deletions(-) create mode 100644 src/main/java/knusearch/clear/jpa/service/post/BM25UpdateException.java create mode 100644 src/main/java/knusearch/clear/jpa/service/post/CheckPostResult.java diff --git a/build.gradle b/build.gradle index 9d58df1..8bccb7e 100644 --- a/build.gradle +++ b/build.gradle @@ -38,6 +38,8 @@ dependencies { implementation 'org.json:json:20190722' implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch' implementation 'net.sourceforge.tess4j:tess4j:4.5.4' + implementation 'org.springframework.retry:spring-retry' + implementation 'org.springframework:spring-aspects' testImplementation 'junit:junit:4.13.1' compileOnly 'org.projectlombok:lombok' diff --git a/src/main/java/knusearch/clear/ClearApplication.java b/src/main/java/knusearch/clear/ClearApplication.java index ee85fbc..3aa2fcb 100644 --- a/src/main/java/knusearch/clear/ClearApplication.java +++ b/src/main/java/knusearch/clear/ClearApplication.java @@ -3,10 +3,12 @@ import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.context.ApplicationContext; +import org.springframework.retry.annotation.EnableRetry; import org.springframework.scheduling.annotation.EnableScheduling; @SpringBootApplication @EnableScheduling +@EnableRetry public class ClearApplication { public static void main(String[] args) throws Exception { diff --git a/src/main/java/knusearch/clear/jpa/repository/post/PostTermRepository.java b/src/main/java/knusearch/clear/jpa/repository/post/PostTermRepository.java index b2af4f8..e2f2952 100644 --- a/src/main/java/knusearch/clear/jpa/repository/post/PostTermRepository.java +++ b/src/main/java/knusearch/clear/jpa/repository/post/PostTermRepository.java @@ -1,9 +1,7 @@ package knusearch.clear.jpa.repository.post; -import java.util.List; import knusearch.clear.jpa.domain.post.PostTerm; import org.springframework.data.jpa.repository.JpaRepository; -import org.springframework.data.jpa.repository.Query; import org.springframework.stereotype.Repository; @Repository diff --git a/src/main/java/knusearch/clear/jpa/service/ScrapingService.java b/src/main/java/knusearch/clear/jpa/service/ScrapingService.java index e7a3bab..a485317 100644 --- a/src/main/java/knusearch/clear/jpa/service/ScrapingService.java +++ b/src/main/java/knusearch/clear/jpa/service/ScrapingService.java @@ -1,6 +1,7 @@ package knusearch.clear.jpa.service; +import jakarta.persistence.Tuple; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.ArrayList; @@ -14,6 +15,7 @@ import knusearch.clear.jpa.domain.site.Board; import knusearch.clear.jpa.domain.site.Site; import knusearch.clear.jpa.repository.post.BasePostRepository; +import knusearch.clear.jpa.service.post.CheckPostResult; import knusearch.clear.util.ImageDownloader; import knusearch.clear.util.OCRProcessor; import lombok.RequiredArgsConstructor; @@ -240,7 +242,8 @@ public String cutString(String text, int byteSize) { return text; } - public void scrapeYesterdayPosts(Site site) { + public List scrapeYesterdayPosts(Site site) { + List basePosts = new ArrayList<>(); String baseUrl = site.getBaseUrl(); List boards = site.getBoards(); @@ -248,27 +251,35 @@ public void scrapeYesterdayPosts(Site site) { for (Board board : boards) { String postUrl = board.getEncryptedName(); - savePostsWithinPeriod(baseUrl, postUrl, yesterday); + List newPosts = savePostsWithinPeriod(baseUrl, postUrl, yesterday); + basePosts.addAll(newPosts); } + + return basePosts; } @Transactional - public void savePostsWithinPeriod(String baseUrl, String postUrl, LocalDate yesterday) { + public List savePostsWithinPeriod(String baseUrl, String postUrl, LocalDate yesterday) { + List newPosts = new ArrayList<>(); int pageIdx = 1; boolean isTimeToBreak = false; while (isTimeToBreak) { Elements links = getAllLinksFromOnePage(baseUrl, postUrl, pageIdx); - isTimeToBreak = checkWithinPeriodAndSave(baseUrl, postUrl, yesterday, links); + CheckPostResult checkPostResult = checkWithinPeriodAndSave(baseUrl, postUrl, yesterday, links); + isTimeToBreak = checkPostResult.isShouldBreak(); + newPosts.addAll(checkPostResult.getNewPosts()); pageIdx++; } + return newPosts; } - public boolean checkWithinPeriodAndSave( + private CheckPostResult checkWithinPeriodAndSave( String baseUrl, String postUrl, LocalDate yesterday, Elements links ) { + List newPosts = new ArrayList<>(); for (Element linkElement : links) { BasePost basePost = setURLValues(linkElement, baseUrl, postUrl); @@ -277,11 +288,12 @@ public boolean checkWithinPeriodAndSave( LocalDate dateTime = basePost.getDateTime(); if (dateTime.isBefore(yesterday)) { - return true; + return new CheckPostResult(true, newPosts); } basePostRepository.save(basePost); + newPosts.add(basePost); } - return false; + return new CheckPostResult(false, newPosts); } } diff --git a/src/main/java/knusearch/clear/jpa/service/post/BM25Service.java b/src/main/java/knusearch/clear/jpa/service/post/BM25Service.java index 14be41c..4f2847e 100644 --- a/src/main/java/knusearch/clear/jpa/service/post/BM25Service.java +++ b/src/main/java/knusearch/clear/jpa/service/post/BM25Service.java @@ -11,10 +11,15 @@ import knusearch.clear.jpa.repository.post.BasePostRepository; import knusearch.clear.jpa.repository.post.PostTermRepository; import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; @Service @Getter +@Slf4j public class BM25Service { private double k1 = 1.5; // BM25 조정 파라미터 @@ -29,10 +34,11 @@ public class BM25Service { private List documents; - private BasePostRepository basePostRepository; + private final BasePostService basePostService; private final PostTermRepository postTermRepository; - public BM25Service(BasePostRepository basePostRepository, PostTermRepository postTermRepository) { + public BM25Service(BasePostRepository basePostRepository, PostTermRepository postTermRepository, + BasePostService basePostService) { List documents = basePostRepository.findAll(); List postTerms = postTermRepository.findAll(); @@ -42,19 +48,13 @@ public BM25Service(BasePostRepository basePostRepository, PostTermRepository pos } postTerms.clear(); - postTerms = null; this.totalDocs = documents.size(); this.avgDocLength = calculateAvgDocLength(documents); // 평균 문서 길이 캐싱 this.docFreqs = calculateDocFreqs(documents); // 단어별 문서 빈도 계산 this.postTermRepository = postTermRepository; this.documents = basePostRepository.findAll(); - -// System.out.println("totalDocs: " + totalDocs); -// System.out.println("avgDocLength: " + avgDocLength); - -// System.out.println("docWords: " + docWords.get(1L)); -// System.out.println("docWords: " + docWords.get(2L)); + this.basePostService = basePostService; } // 문서의 평균 길이 계산 (캐싱해둠) @@ -96,10 +96,13 @@ public double calculateBM25(BasePost doc, List query) { for (String word : query) { int termFreq = termFrequency(word, doc); // 단어 빈도 계산 - if (termFreq == 0) continue; // 문서에 단어가 없으면 건너뜀 + if (termFreq == 0) { + continue; // 문서에 단어가 없으면 건너뜀 + } double idf = getCachedIDF(word); // IDF 캐싱 사용 - score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength / avgDocLength)))); + score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength + / avgDocLength)))); } // 시간 가중치를 BM25 점수에 곱해서 반영 return score * timeWeight; @@ -114,10 +117,13 @@ public double calculateBM25WithAi(BasePost doc, List query, for (String word : query) { int termFreq = termFrequency(word, doc); // 단어 빈도 계산 - if (termFreq == 0) continue; // 문서에 단어가 없으면 건너뜀 + if (termFreq == 0) { + continue; // 문서에 단어가 없으면 건너뜀 + } double idf = getCachedIDF(word); // IDF 캐싱 사용 - score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength / avgDocLength)))); + score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength + / avgDocLength)))); } // 시간 가중치를 BM25 점수에 곱해서 반영 @@ -138,7 +144,8 @@ private int termFrequency(String word, BasePost doc) { // 캐싱된 문서 길이 반환 private double getCachedDocLength(BasePost doc) { // System.out.println(doc.getId()+ "docLengthCache: " + docLengthCache.get(doc.getId())); - return docLengthCache.computeIfAbsent(doc.getId(), id -> (double) doc.getContent().length()); + return docLengthCache.computeIfAbsent(doc.getId(), + id -> (double) doc.getContent().length()); } // 캐싱된 IDF 값 반환 @@ -154,4 +161,19 @@ private double calculateIDF(String term) { return idf; } + @Retryable( + value = {BM25UpdateException.class}, + maxAttempts = 3, + backoff = @Backoff(delay = 2000) + ) + @Transactional + public void updateIndex(List basePosts) { + try { + documents.addAll(basePosts); + basePostService.tokenizeAndSaveBasePostTerms(basePosts); + } catch (Exception e) { + log.error("BM25 update failed, retrying...", e); + throw new BM25UpdateException("BM25 update failed", e); + } + } } diff --git a/src/main/java/knusearch/clear/jpa/service/post/BM25UpdateException.java b/src/main/java/knusearch/clear/jpa/service/post/BM25UpdateException.java new file mode 100644 index 0000000..3f755f7 --- /dev/null +++ b/src/main/java/knusearch/clear/jpa/service/post/BM25UpdateException.java @@ -0,0 +1,7 @@ +package knusearch.clear.jpa.service.post; + +public class BM25UpdateException extends RuntimeException { + public BM25UpdateException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/src/main/java/knusearch/clear/jpa/service/post/BasePostService.java b/src/main/java/knusearch/clear/jpa/service/post/BasePostService.java index 54c252a..01022cb 100644 --- a/src/main/java/knusearch/clear/jpa/service/post/BasePostService.java +++ b/src/main/java/knusearch/clear/jpa/service/post/BasePostService.java @@ -8,6 +8,7 @@ import java.util.Optional; import java.util.Set; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.stream.Collectors; @@ -19,7 +20,6 @@ import knusearch.clear.jpa.domain.site.Site; import knusearch.clear.jpa.repository.post.BasePostRepository; import knusearch.clear.jpa.repository.post.PostTermJdbcRepository; -import knusearch.clear.jpa.repository.post.PostTermRepository; import knusearch.clear.jpa.repository.post.TermJdbcRepository; import knusearch.clear.jpa.repository.post.TermRepository; import knusearch.clear.jpa.service.ScrapingService; @@ -32,6 +32,9 @@ import org.springframework.data.domain.Page; import org.springframework.data.domain.PageRequest; import org.springframework.data.domain.Pageable; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; +import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import scala.collection.Seq; @@ -53,7 +56,6 @@ public class BasePostService { // 스레드 안전한 ConcurrentHashMap을 사용하여 Term 캐싱 private final ConcurrentMap termCache = new ConcurrentHashMap<>(); - private final PostTermRepository postTermRepository; private final PostTermJdbcRepository postTermJdbcRepository; private final TermJdbcRepository termJdbcRepository; @@ -66,31 +68,75 @@ public class BasePostService { @Transactional public void saveAllTermPosts() { List basePosts = basePostRepository.findAll(); + tokenizeAndSaveBasePostTerms(basePosts); + } - // PostTerm을 모아둘 리스트 + public void tokenizeAndSaveBasePostTerms(List basePosts) { List postTermsBatch = new ArrayList<>(); for (BasePost post : basePosts) { List postTerms = saveTermPost(post); - - // PostTerm을 배치에 추가 postTermsBatch.addAll(postTerms); - // 배치 크기만큼 쌓였을 때 bulk insert 실행 if (postTermsBatch.size() >= BATCH_SIZE) { - postTermJdbcRepository.saveAll(postTermsBatch); - postTermsBatch.clear(); // 배치 완료 후 리스트 비움 + // postTermsBatch의 복사본을 사용하여 비동기 작업을 동시에 실행 + List batchCopy = new ArrayList<>(postTermsBatch); + savePostTermsAsync(batchCopy); + updateBM25Async(batchCopy); + + postTermsBatch.clear(); } } - // 남아있는 PostTerm이 있으면 마저 insert 실행 if (!postTermsBatch.isEmpty()) { - postTermJdbcRepository.saveAll(postTermsBatch); + List batchCopy = new ArrayList<>(postTermsBatch); + savePostTermsAsync(batchCopy); + updateBM25Async(batchCopy); } } + // 1번 작업: PostTerm을 저장 (비동기 및 재시도) + @Async + @Retryable( + value = {Exception.class}, + maxAttempts = 3, + backoff = @Backoff(delay = 2000) + ) + public CompletableFuture savePostTermsAsync(List postTermsBatch) { + return CompletableFuture.runAsync(() -> { + try { + postTermJdbcRepository.saveAll(postTermsBatch); + } catch (Exception e) { + log.error("Failed to save post terms, retrying...", e); + throw new RuntimeException(e); + } + }); + } + + // 2번 작업: BM25 업데이트 (비동기 및 재시도) + @Async + @Retryable( + value = {Exception.class}, + maxAttempts = 3, + backoff = @Backoff(delay = 2000) + ) + public CompletableFuture updateBM25Async(List postTermsBatch) { + return CompletableFuture.runAsync(() -> { + try { + // TODO: 캐시 업데이트 + } catch (Exception e) { + log.error("Failed to update BM25, retrying...", e); + throw new RuntimeException(e); + } + }).thenRun(() -> { + // TODO: 후속 작업을 이곳에 추가 + log.info("BM25 update completed successfully for batch."); + }); + } + /** * 게시글을 저장하고, 해당 게시글의 단어를 추출하여 저장 + * * @param post 게시글 객체 * @return 저장할 PostTerm 리스트 */ @@ -160,6 +206,7 @@ private List saveTermPost(BasePost post) { /** * 게시글의 내용에서 형태소 분석을 통해 단어를 추출 + * * @param content 게시글 내용 * @return 단어 집합 */ @@ -176,7 +223,8 @@ public Set extractTermsFromContent(String content) { // 각 토큰을 Term으로 변환하여 Set에 추가 OpenKoreanTextProcessorJava.tokensToJavaKoreanTokenList(tokens).forEach(token -> { // 조건: 명사이면서, 길이가 1보다 크고, 특수문자가 포함되지 않은 경우 - if ((token.getPos().toString().equals("Noun") || token.getPos().toString().equals("ProperNoun")) + if ((token.getPos().toString().equals("Noun") || token.getPos().toString() + .equals("ProperNoun")) && token.getText().length() > 1 && !containsSpecialCharacter(token.getText())) { @@ -191,6 +239,7 @@ public Set extractTermsFromContent(String content) { /** * 주어진 문자열에 특수문자가 포함되어 있는지 확인하는 메서드 + * * @param text 확인할 텍스트 * @return 특수문자가 포함되어 있으면 true, 아니면 false */ @@ -239,7 +288,8 @@ public void checkAndSave(BasePost basePost) { String encMenuBoardSeq = basePost.getEncryptedMenuBoardSequence(); //DB에 없는 것만 추가!!! - if (basePostRepository.findAllByEncryptedMenuSequenceAndEncryptedMenuBoardSequence(encMenuSeq, encMenuBoardSeq).size() == 0) { + if (basePostRepository.findAllByEncryptedMenuSequenceAndEncryptedMenuBoardSequence( + encMenuSeq, encMenuBoardSeq).size() == 0) { scrapingService.setPostValues(basePost); System.out.println(basePost.getTitle()); // 추출한 데이터를 MySQL 데이터베이스에 저장하는 코드 추가 @@ -310,7 +360,8 @@ public List findByQuery(String query, int option) { return transformToClassifyResponse(basePosts); } - private static List transformToClassifyResponse(List basePosts) { + private static List transformToClassifyResponse( + List basePosts) { return basePosts.stream() .map(basePost -> new BasePostClassifyResponse( basePost.getId(), @@ -329,7 +380,8 @@ public void updateClassification(BasePost basePost, String classification) { } @Transactional - public void updateClassification(String query, int option, String except, String classification) { + public void updateClassification(String query, int option, String except, + String classification) { List posts = new ArrayList<>(); if (except.isEmpty()) { posts = findAllByQuery(query, option); @@ -354,7 +406,8 @@ public void updateClassification(String query, int option, String except, String } @Transactional - public List findBasePostsNotInClassifications(List classifications) { + public List findBasePostsNotInClassifications( + List classifications) { List posts = basePostRepository.findBasePostsNotInClassifications( classifications, PageRequest.of(0, 5)); diff --git a/src/main/java/knusearch/clear/jpa/service/post/CheckPostResult.java b/src/main/java/knusearch/clear/jpa/service/post/CheckPostResult.java new file mode 100644 index 0000000..a3eeb59 --- /dev/null +++ b/src/main/java/knusearch/clear/jpa/service/post/CheckPostResult.java @@ -0,0 +1,14 @@ +package knusearch.clear.jpa.service.post; + +import java.util.List; +import knusearch.clear.jpa.domain.post.BasePost; +import lombok.Getter; +import lombok.RequiredArgsConstructor; + +@RequiredArgsConstructor +@Getter +public class CheckPostResult { + private final boolean shouldBreak; + private final List newPosts; + +} diff --git a/src/main/java/knusearch/clear/jpa/service/post/PostScrapingTask.java b/src/main/java/knusearch/clear/jpa/service/post/PostScrapingTask.java index b073b41..be00749 100644 --- a/src/main/java/knusearch/clear/jpa/service/post/PostScrapingTask.java +++ b/src/main/java/knusearch/clear/jpa/service/post/PostScrapingTask.java @@ -1,23 +1,25 @@ package knusearch.clear.jpa.service.post; +import java.util.List; +import knusearch.clear.jpa.domain.post.BasePost; import knusearch.clear.jpa.domain.site.Site; import knusearch.clear.jpa.service.ScrapingService; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; @Component +@RequiredArgsConstructor @Slf4j public class PostScrapingTask { private final ScrapingService scrapingService; - - public PostScrapingTask(ScrapingService scrapingService) { - this.scrapingService = scrapingService; - } + private final BM25Service bm25Service; @Scheduled(cron = "0 0 0 * * *") public void performTask() { - scrapingService.scrapeYesterdayPosts(Site.MAIN); + List basePosts = scrapingService.scrapeYesterdayPosts(Site.MAIN); + bm25Service.updateIndex(basePosts); } }