뉴스 크롤링
언론사/카테고리별 뉴스 수집
- 다양한 카테고리 지원: 정치, 경제, 사회, 생활/문화, 세계, IT/과학 등 6개 카테고리
- 신뢰할 수 있는 언론사: 연합뉴스, 동아일보, 중앙일보, 한겨레, 경향신문, MBC, 파이낸셜뉴스, 국민일보, 서울경제, 한국일보, 헤럴드경제, YTN, 문화일보, 오마이뉴스, SBS, KBS
- ์๋ ํฌ๋กค๋ง: Selenium WebDriver๋ฅผ ์ฌ์ฉํ ๋์ ์ฝํ
์ธ ํฌ๋กค๋ง
- 스케줄링: 매일 자정에 자동으로 뉴스 새로고침
// 카테고리별 뉴스 수집
/**
 * News category IDs mapped to their Korean display names.
 *
 * <p>The names are stored as proper Unicode text; mis-encoded names would be
 * passed on as category names and never match real ones.
 *
 * <p>{@link Map#of} returns an immutable map with unspecified iteration order,
 * so callers must not rely on the order in which categories are visited.
 */
private static final Map<Integer, String> CATEGORIES = Map.of(
    100, "정치",
    101, "경제",
    102, "사회",
    103, "생활/문화",
    104, "세계",
    105, "IT/과학"
);
// 허용된 언론사 필터링
private boolean isAllowedPress(String press) {
Set<String> allowedPresses = Set.of(
"์ฐํฉ๋ด์ค", "๋์์ผ๋ณด", "์ค์์ผ๋ณด", "ํ๊ฒจ๋ ", "๊ฒฝํฅ์ ๋ฌธ",
"MBC", "ํ์ด๋ธ์
๋ด์ค", "๊ตญ๋ฏผ์ผ๋ณด", "์์ธ๊ฒฝ์ ", "ํ๊ตญ์ผ๋ณด",
"ํค๋ด๋๊ฒฝ์ ", "YTN", "๋ฌธํ์ผ๋ณด", "์ค๋ง์ด๋ด์ค", "SBS", "KBS"
);
return allowedPresses.stream().anyMatch(p -> p.equalsIgnoreCase(press.trim()));
}
// 카테고리별 크롤링
/**
 * Collects up to {@code targetCount} distinct articles from one news section page.
 *
 * <p>Opens the section page for {@code categoryId}, reads the article list items,
 * and keeps each article whose link has not been seen during this call. After
 * each pass over the list it asks {@code clickMoreButton} to load more and stops
 * when that returns {@code false} or the target is reached.
 *
 * <p>NOTE(review): the loop ends only on those two conditions; confirm that
 * {@code clickMoreButton} returns {@code false} once no further articles load,
 * otherwise this can spin indefinitely.
 *
 * @param driver       browser session used to load and query the page
 * @param wait         wait helper passed through to {@code clickMoreButton}
 * @param categoryId   section ID appended to the section URL
 * @param categoryName category name passed through to {@code extractArticleDetail}
 * @param targetCount  maximum number of articles to collect
 * @return the collected articles in the order they were first seen; never {@code null}
 */
private List<NewsDetail> crawlCategory(WebDriver driver, WebDriverWait wait,
        int categoryId, String categoryName, int targetCount) {
    List<NewsDetail> categoryNews = new ArrayList<>();
    Set<String> collectedLinks = new HashSet<>();

    // Plain URL: wrapping it in angle brackets (markup residue) makes it unloadable.
    String categoryUrl = "https://news.naver.com/section/" + categoryId;
    driver.get(categoryUrl);

    while (collectedLinks.size() < targetCount) {
        List<WebElement> articles = driver.findElements(
            By.cssSelector("#newsct > div.section_latest > div > div.section_latest_article._CONTENT_LIST._PERSIST_META > div > ul > li")
        );

        // The whole list is re-read on every pass; the link set filters out repeats.
        for (WebElement article : articles) {
            if (collectedLinks.size() >= targetCount) {
                break;
            }
            NewsDetail detail = extractArticleDetail(article, categoryId, categoryName);
            // Set.add returns false for an already-seen link, so one call checks and records.
            if (detail != null && collectedLinks.add(detail.getLink())) {
                categoryNews.add(detail);
            }
        }

        if (!clickMoreButton(driver, wait)) {
            break;
        }
    }
    return categoryNews;
}
์ค๋ณต ๊ธฐ์ฌ ๋ฐฉ์ง
- ๋งํฌ ๊ธฐ๋ฐ ์ค๋ณต ๊ฒ์ฌ: ๊ธฐ์ฌ URL์ ์ ๊ทํํ์ฌ ์ค๋ณต ๋ฐฉ์ง
- 데이터베이스 레벨 검증: 저장 시 existsByLink() 메서드로 중복 확인
- 메모리 레벨 중복 방지: 크롤링 중 Set<String>을 사용한 실시간 중복 체크
- 링크 정규화: URL 파라미터 제거 및 슬래시 정리로 정확한 중복 검사
// 중복 기사 방지 로직
/**
 * Saves a crawled article unless it has no usable link or is already stored.
 *
 * <p>The link is normalized with {@code normalizeLink} before the duplicate
 * check, and the normalized form is the one that gets stored.
 *
 * <p>NOTE(review): {@code existsByLink} followed by {@code save} is a
 * check-then-act sequence; if saves can run concurrently, a unique constraint
 * on the link column is presumably also needed — confirm.
 *
 * @param newsDetail crawled article data; may be {@code null}
 * @return the value returned by the repository's {@code save}, or {@code null}
 *         when the input is {@code null}, the link is missing, blank or empty
 *         after normalization, or an article with the same link already exists
 */
public NewsArticle saveNewsArticle(NewsDetail newsDetail) {
    if (newsDetail == null) {
        return null;
    }
    String rawLink = newsDetail.getLink();
    if (rawLink == null || rawLink.trim().isEmpty()) {
        return null;
    }

    // Normalize the link so equivalent URLs share one stored form.
    String normalizedLink = normalizeLink(rawLink);
    if (normalizedLink.isEmpty()) {
        // A link such as "/" normalizes to nothing and cannot identify an article.
        return null;
    }

    // Check the database for an existing article with this link.
    if (newsArticleRepository.existsByLink(normalizedLink)) {
        return null; // duplicates are not saved
    }

    NewsArticle newsArticle = NewsArticle.builder()
        .categoryId(newsDetail.getCategoryId())
        .categoryName(newsDetail.getCategoryName())
        .press(newsDetail.getPress())
        .title(newsDetail.getTitle())
        .content(newsDetail.getContent())
        .reporter(newsDetail.getReporter())
        .date(newsDetail.getDate())
        .link(normalizedLink)
        .build();
    return newsArticleRepository.save(newsArticle);
}
// 링크 정규화
/**
 * Normalizes an article link for duplicate detection.
 *
 * <p>Drops the query string (everything from the first {@code '?'}), removes
 * surrounding whitespace, and removes a single trailing slash.
 *
 * @param link raw link; may be {@code null}
 * @return the normalized link, or an empty string when {@code link} is {@code null}
 */
private String normalizeLink(String link) {
    if (link == null) {
        return "";
    }
    String normalized = link;

    // Remove URL parameters.
    int queryStart = normalized.indexOf('?');
    if (queryStart > 0) {
        normalized = normalized.substring(0, queryStart);
    }

    // Trim before the slash check so surrounding whitespace cannot hide a trailing slash.
    normalized = normalized.trim();

    // Remove the trailing slash.
    if (normalized.endsWith("/")) {
        normalized = normalized.substring(0, normalized.length() - 1);
    }

    // Removing the slash can expose whitespace that preceded it.
    return normalized.trim();
}
기술적 구현
크롤링 엔진
- Selenium WebDriver: 동적 콘텐츠 크롤링을 위한 브라우저 자동화
- Chrome WebDriver: Chrome 브라우저를 통한 안정적인 크롤링
- WebDriverWait: 요소 로딩 대기로 안정성 향상
- Headless 모드: 서버 환경에서의 크롤링 최적화
데이터 관리