Skip to content

Commit

Permalink
feat: 새로운 게시글 스크래핑 후 역색인 및 BM25업데이트, 비동기 및 재시도
Browse files Browse the repository at this point in the history
- BM25 업데이트 작업을 비동기 메서드로 분리하여 실행 (`updateBM25Async`)
- 실패 시 최대 3회 재시도하도록 @retryable 적용, 재시도 간격 2초 설정
- CompletableFuture.runAsync() 사용하여 비동기 실행 후 thenRun()으로 후속 처리 추가
  • Loading branch information
inpink committed Nov 9, 2024
1 parent dedb84b commit 6a6608d
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 53 deletions.
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ dependencies {
implementation 'org.json:json:20190722'
implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch'
implementation 'net.sourceforge.tess4j:tess4j:4.5.4'
implementation 'org.springframework.retry:spring-retry'
implementation 'org.springframework:spring-aspects'

testImplementation 'junit:junit:4.13.1'
compileOnly 'org.projectlombok:lombok'
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/knusearch/clear/ClearApplication.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ApplicationContext;
import org.springframework.retry.annotation.EnableRetry;
import org.springframework.scheduling.annotation.EnableScheduling;

@SpringBootApplication
@EnableScheduling
@EnableRetry
public class ClearApplication {

public static void main(String[] args) throws Exception {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/knusearch/clear/jpa/domain/site/MainBoard.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ public enum MainBoard implements Board {

@Override
public String getName() {
return null;
return name;
}

@Override
public String getEncryptedName() {
return null;
return encryptedName;
}
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
package knusearch.clear.jpa.repository.post;

import java.util.List;
import knusearch.clear.jpa.domain.post.PostTerm;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.stereotype.Repository;

@Repository
Expand Down
43 changes: 29 additions & 14 deletions src/main/java/knusearch/clear/jpa/service/ScrapingService.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package knusearch.clear.jpa.service;


import jakarta.persistence.Tuple;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
Expand All @@ -14,6 +15,7 @@
import knusearch.clear.jpa.domain.site.Board;
import knusearch.clear.jpa.domain.site.Site;
import knusearch.clear.jpa.repository.post.BasePostRepository;
import knusearch.clear.jpa.service.post.CheckPostResult;
import knusearch.clear.util.ImageDownloader;
import knusearch.clear.util.OCRProcessor;
import lombok.RequiredArgsConstructor;
Expand Down Expand Up @@ -228,8 +230,6 @@ private String extractText(String imageUrl) throws Exception {
}
}


//글자수가 len*4 Byte를 초과하는 경우 cut하기.
public String cutString(String text, int byteSize) {
int koreanLen = byteSize / 4;
if (text != null && text.length() > koreanLen) {
Expand All @@ -240,48 +240,63 @@ public String cutString(String text, int byteSize) {
return text;
}

public void scrapeYesterdayPosts(Site site) {
public List<BasePost> scrapeYesterdayPosts(Site site) {
List<BasePost> basePosts = new ArrayList<>();
String baseUrl = site.getBaseUrl();
List<Board> boards = site.getBoards();

final LocalDate yesterday = LocalDate.now().minusDays(1);

for (Board board : boards) {
String postUrl = board.getEncryptedName();
savePostsWithinPeriod(baseUrl, postUrl, yesterday);
List<BasePost> newPosts = savePostsWithinPeriod(baseUrl, postUrl);
basePosts.addAll(newPosts);
}

return basePosts;
}

@Transactional
public void savePostsWithinPeriod(String baseUrl, String postUrl, LocalDate yesterday) {
public List<BasePost> savePostsWithinPeriod(String baseUrl, String postUrl) {
List<BasePost> newPosts = new ArrayList<>();
int pageIdx = 1;
boolean isTimeToBreak = false;
while (isTimeToBreak) {

while (!isTimeToBreak) {
Elements links = getAllLinksFromOnePage(baseUrl, postUrl, pageIdx);
isTimeToBreak = checkWithinPeriodAndSave(baseUrl, postUrl, yesterday, links);
CheckPostResult checkPostResult = checkWithinPeriodAndSave(baseUrl, postUrl, links);
isTimeToBreak = checkPostResult.isShouldBreak();
newPosts.addAll(checkPostResult.getNewPosts());
pageIdx++;
}
return newPosts;
}

public boolean checkWithinPeriodAndSave(
private CheckPostResult checkWithinPeriodAndSave(
String baseUrl,
String postUrl,
LocalDate yesterday,
Elements links
) {
List<BasePost> newPosts = new ArrayList<>();
final LocalDate yesterday = LocalDate.now().minusDays(1);

for (Element linkElement : links) {
BasePost basePost = setURLValues(linkElement, baseUrl, postUrl);

Map<String, Object> predictResult = classificationService.predictClassification(basePost.getText() + basePost.getTitle());
Map<String, Object> predictResult = classificationService.predictClassification(
basePost.getText() + basePost.getTitle());
basePost.setClassification((String) predictResult.get("predictedClass"));

LocalDate dateTime = basePost.getDateTime();
log.info("yesterdat" + yesterday.getDayOfMonth());
log.info("dateTime" + dateTime.getDayOfMonth());

if (dateTime.isBefore(yesterday)) {
return true;
log.info("Time to break");
return new CheckPostResult(true, newPosts);
}

basePostRepository.save(basePost);
newPosts.add(basePost);
}
return false;
return new CheckPostResult(false, newPosts);
}
}
50 changes: 36 additions & 14 deletions src/main/java/knusearch/clear/jpa/service/post/BM25Service.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@
import knusearch.clear.jpa.repository.post.BasePostRepository;
import knusearch.clear.jpa.repository.post.PostTermRepository;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

@Service
@Getter
@Slf4j
public class BM25Service {

private double k1 = 1.5; // BM25 조정 파라미터
Expand All @@ -29,10 +34,11 @@ public class BM25Service {

private List<BasePost> documents;

private BasePostRepository basePostRepository;
private final BasePostService basePostService;
private final PostTermRepository postTermRepository;

public BM25Service(BasePostRepository basePostRepository, PostTermRepository postTermRepository) {
public BM25Service(BasePostRepository basePostRepository, PostTermRepository postTermRepository,
BasePostService basePostService) {
List<BasePost> documents = basePostRepository.findAll();

List<PostTerm> postTerms = postTermRepository.findAll();
Expand All @@ -42,19 +48,13 @@ public BM25Service(BasePostRepository basePostRepository, PostTermRepository pos
}

postTerms.clear();
postTerms = null;

this.totalDocs = documents.size();
this.avgDocLength = calculateAvgDocLength(documents); // 평균 문서 길이 캐싱
this.docFreqs = calculateDocFreqs(documents); // 단어별 문서 빈도 계산
this.postTermRepository = postTermRepository;
this.documents = basePostRepository.findAll();

// System.out.println("totalDocs: " + totalDocs);
// System.out.println("avgDocLength: " + avgDocLength);

// System.out.println("docWords: " + docWords.get(1L));
// System.out.println("docWords: " + docWords.get(2L));
this.basePostService = basePostService;
}

// 문서의 평균 길이 계산 (캐싱해둠)
Expand Down Expand Up @@ -96,10 +96,13 @@ public double calculateBM25(BasePost doc, List<String> query) {

for (String word : query) {
int termFreq = termFrequency(word, doc); // 단어 빈도 계산
if (termFreq == 0) continue; // 문서에 단어가 없으면 건너뜀
if (termFreq == 0) {
continue; // 문서에 단어가 없으면 건너뜀
}

double idf = getCachedIDF(word); // IDF 캐싱 사용
score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength / avgDocLength))));
score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength
/ avgDocLength))));
}
// 시간 가중치를 BM25 점수에 곱해서 반영
return score * timeWeight;
Expand All @@ -114,10 +117,13 @@ public double calculateBM25WithAi(BasePost doc, List<String> query,

for (String word : query) {
int termFreq = termFrequency(word, doc); // 단어 빈도 계산
if (termFreq == 0) continue; // 문서에 단어가 없으면 건너뜀
if (termFreq == 0) {
continue; // 문서에 단어가 없으면 건너뜀
}

double idf = getCachedIDF(word); // IDF 캐싱 사용
score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength / avgDocLength))));
score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * (docLength
/ avgDocLength))));
}
// 시간 가중치를 BM25 점수에 곱해서 반영

Expand All @@ -138,7 +144,8 @@ private int termFrequency(String word, BasePost doc) {
// 캐싱된 문서 길이 반환
private double getCachedDocLength(BasePost doc) {
// System.out.println(doc.getId()+ "docLengthCache: " + docLengthCache.get(doc.getId()));
return docLengthCache.computeIfAbsent(doc.getId(), id -> (double) doc.getContent().length());
return docLengthCache.computeIfAbsent(doc.getId(),
id -> (double) doc.getContent().length());
}

// 캐싱된 IDF 값 반환
Expand All @@ -154,4 +161,19 @@ private double calculateIDF(String term) {
return idf;
}

@Retryable(
value = {BM25UpdateException.class},
maxAttempts = 3,
backoff = @Backoff(delay = 2000)
)
@Transactional
public void updateIndex(List<BasePost> basePosts) {
try {
documents.addAll(basePosts);
basePostService.tokenizeAndSaveBasePostTerms(basePosts);
} catch (Exception e) {
log.error("BM25 update failed, retrying...", e);
throw new BM25UpdateException("BM25 update failed", e);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package knusearch.clear.jpa.service.post;

public class BM25UpdateException extends RuntimeException {
public BM25UpdateException(String message, Throwable cause) {
super(message, cause);
}
}
Loading

0 comments on commit 6a6608d

Please sign in to comment.