뉴스 크롤링

언론사/카테고리별 뉴스 수집

// News collection by category
/**
 * Category ids mapped to their Korean display names.
 *
 * <p>The labels are UTF-8 Korean text, so this file must be read and compiled as UTF-8 for them to
 * survive intact. {@code Map.of} returns an immutable map whose iteration order is unspecified;
 * callers that need a stable order must sort the keys themselves.
 *
 * <p>NOTE(review): the keys look like the ids appended to the section URL in
 * {@code crawlCategory} — confirm against the caller.
 */
private static final Map<Integer, String> CATEGORIES = Map.of(
    100, "정치",
    101, "경제",
    102, "사회",
    103, "생활/문화",
    104, "세계",
    105, "IT/과학"
);

// Filter to allowed press outlets
/**
 * Press names accepted by {@code isAllowedPress}. Built once rather than on every call.
 * Names are UTF-8 Korean text (plus a few Latin abbreviations).
 */
private static final Set<String> ALLOWED_PRESSES = Set.of(
    "연합뉴스", "동아일보", "중앙일보", "한겨레", "경향신문",
    "MBC", "파이낸셜뉴스", "국민일보", "서울경제", "한국일보",
    "헤럴드경제", "YTN", "문화일보", "오마이뉴스", "SBS", "KBS"
);

/**
 * Reports whether a press name is on the allow-list.
 *
 * <p>The comparison ignores surrounding whitespace and letter case, so {@code " mbc "} matches
 * {@code "MBC"}.
 *
 * @param press press name to check; may be {@code null}
 * @return {@code true} if the trimmed name equals an allowed press ignoring case;
 *         {@code false} otherwise, including when {@code press} is {@code null}
 */
private boolean isAllowedPress(String press) {
    if (press == null) {
        return false;
    }
    // Trim once up front instead of once per allow-list entry.
    String candidate = press.trim();
    return ALLOWED_PRESSES.stream().anyMatch(allowed -> allowed.equalsIgnoreCase(candidate));
}

// Crawl one category
/**
 * Collects up to {@code targetCount} articles, unique by link, from one news section page.
 *
 * <p>Loads the section URL for {@code categoryId}, reads the article list items currently on the
 * page, and keeps calling {@code clickMoreButton} until the target is reached or that call
 * returns {@code false}.
 *
 * @param driver       browser session used to load and query the page
 * @param wait         passed through to {@code clickMoreButton}
 * @param categoryId   section id appended to the section URL
 * @param categoryName passed to {@code extractArticleDetail} together with the id
 * @param targetCount  maximum number of articles to collect
 * @return the collected articles in the order they were encountered; may hold fewer than
 *         {@code targetCount} entries
 */
private List<NewsDetail> crawlCategory(WebDriver driver, WebDriverWait wait,
                                     int categoryId, String categoryName, int targetCount) {
    List<NewsDetail> categoryNews = new ArrayList<>();
    Set<String> collectedLinks = new HashSet<>();

    // Must be a plain URL: wrapping it in Markdown autolink brackets ("<...>") makes it unloadable.
    String categoryUrl = "https://news.naver.com/section/" + categoryId;
    driver.get(categoryUrl);

    while (collectedLinks.size() < targetCount) {
        // The full list is re-read on each pass; links already collected are skipped below.
        List<WebElement> articles = driver.findElements(
            By.cssSelector("#newsct > div.section_latest > div > div.section_latest_article._CONTENT_LIST._PERSIST_META > div > ul > li")
        );

        for (WebElement article : articles) {
            if (collectedLinks.size() >= targetCount) {
                break;
            }

            NewsDetail detail = extractArticleDetail(article, categoryId, categoryName);
            // Set.add returns false for a link that is already present, so one call both
            // checks for and records the link.
            if (detail != null && collectedLinks.add(detail.getLink())) {
                categoryNews.add(detail);
            }
        }

        // Stop when no further articles can be requested.
        if (!clickMoreButton(driver, wait)) {
            break;
        }
    }

    return categoryNews;
}

중복 기사 방지

// Duplicate-article prevention logic
/**
 * Persists a crawled article unless it has no usable link or its link is already stored.
 *
 * <p>The link is normalized with {@code normalizeLink} before the duplicate lookup, and the
 * normalized form is what gets stored.
 *
 * <p>NOTE(review): the existence check and the save are two separate repository calls; if this
 * method can run concurrently, a unique constraint on the link column is still needed — confirm.
 *
 * @param newsDetail crawled article data; may be {@code null}
 * @return the value returned by the repository's {@code save}, or {@code null} when
 *         {@code newsDetail} is {@code null}, its link is missing or blank, the link normalizes
 *         to an empty string, or an article with the same normalized link already exists
 */
public NewsArticle saveNewsArticle(NewsDetail newsDetail) {
    if (newsDetail == null) {
        return null;
    }

    String rawLink = newsDetail.getLink();
    if (rawLink == null || rawLink.trim().isEmpty()) {
        return null;
    }

    // Normalize first so the duplicate check and the stored value use the same key.
    String normalizedLink = normalizeLink(rawLink);
    if (normalizedLink.isEmpty()) {
        // A link such as "/" normalizes to nothing and cannot serve as a dedup key.
        return null;
    }

    // Duplicate articles are not saved.
    if (newsArticleRepository.existsByLink(normalizedLink)) {
        return null;
    }

    NewsArticle newsArticle = NewsArticle.builder()
            .categoryId(newsDetail.getCategoryId())
            .categoryName(newsDetail.getCategoryName())
            .press(newsDetail.getPress())
            .title(newsDetail.getTitle())
            .content(newsDetail.getContent())
            .reporter(newsDetail.getReporter())
            .date(newsDetail.getDate())
            .link(normalizedLink)
            .build();

    return newsArticleRepository.save(newsArticle);
}

// ๋งํฌ ์ •๊ทœํ™”
/**
 * Canonicalizes an article link so that equivalent links compare equal.
 *
 * <p>Surrounding whitespace is removed first, then the query string (everything from the first
 * {@code '?'}, unless it is the very first character), then a single trailing slash.
 *
 * @param link raw link; may be {@code null}
 * @return the normalized link, or an empty string when {@code link} is {@code null}
 */
private String normalizeLink(String link) {
    if (link == null) {
        return "";
    }

    // Trim before anything else; otherwise padding hides a trailing slash from the check
    // below and the padded and unpadded forms normalize to different keys.
    String normalized = link.trim();

    // Drop URL parameters.
    int paramIndex = normalized.indexOf('?');
    if (paramIndex > 0) {
        normalized = normalized.substring(0, paramIndex).trim();
    }

    // Drop one trailing slash.
    if (normalized.endsWith("/")) {
        normalized = normalized.substring(0, normalized.length() - 1);
    }

    return normalized.trim();
}

기술적 구현

크롤링 엔진

๋ฐ์ดํ„ฐ ๊ด€๋ฆฌ