1
0

feat: 增加新站点采集

This commit is contained in:
2024-11-05 15:14:13 +08:00
parent de585e3016
commit bd67c9f830
3 changed files with 50 additions and 33 deletions

View File

@@ -56,9 +56,18 @@ class Application : ApplicationRunner, ApplicationListener<ContextClosedEvent> {
// PushRunner instance supplied through @Resource injection (hence lateinit).
// Its invocation inside run(args) is currently commented out.
@Resource
private lateinit var pushRunner: PushRunner
// Article repository supplied through @Resource injection (hence lateinit).
// run(args) uses it to check whether any article already exists for a site's code.
@Resource
private lateinit var articleRepository: ArticleRepository
/**
 * ApplicationRunner callback: crawls every configured site that has no stored article yet.
 *
 * Every entry of `sites` is first checked with `articleRepository.existsByCode`;
 * the sites for which no article exists are then logged and handed to
 * `newsRunner.run(code)` one after another.
 *
 * @param args application arguments; not read by this method.
 */
override fun run(args: ApplicationArguments?) {
    // Unconditional runs are kept disabled:
    // pushRunner.run()
    // newsRunner.run()

    // Resolve the full list of unseen sites before crawling any of them.
    val unseenSites = sites.filterNot { site -> articleRepository.existsByCode(site.code) }
    for (site in unseenSites) {
        logger.info("发现新站点:{}", site.name)
        newsRunner.run(site.code)
    }
}
override fun onApplicationEvent(event: ContextClosedEvent) {

View File

@@ -34,6 +34,8 @@ interface ArticleRepository : JpaRepository<Article, String>, JpaSpecificationEx
/**
 * Returns the articles whose `description` is null and whose `text` is not null.
 * NOTE(review): no explicit @Query is visible, so this presumably relies on
 * Spring Data's method-name query derivation — confirm.
 */
fun findAllByDescriptionIsNullAndTextIsNotNull(): List<Article>
/**
 * Returns the articles whose `html` is not null.
 * NOTE(review): no explicit @Query is visible, so this presumably relies on
 * Spring Data's method-name query derivation — confirm.
 */
fun findAllByHtmlIsNotNull(): List<Article>
/**
 * Tells whether at least one article with the given `code` exists.
 * NOTE(review): no explicit @Query is visible, so this presumably relies on
 * Spring Data's method-name query derivation — confirm.
 *
 * @param code value matched against the article's `code`; the application's
 *             startup runner passes a site's code here.
 * @return true when a matching article exists, false otherwise.
 */
fun existsByCode(code: String): Boolean
@Modifying
@Transactional
@Query("update Article article set article.pushed = :pushed where article.id = :id")

View File

@@ -45,42 +45,48 @@ class NewsRunner : Runner {
@Scheduled(cron = "0 0 6-18 * * ?")
override fun run() {
sites.forEach { site ->
logger.info("站点: {}", site.name)
keywords.forEach { keyword ->
val url = site.search.replace("{query}", keyword)
logger.info("类目: {}, 地址: {}", keyword, url)
val hashList = articleRepository.findAllId()
val links = parseArticleLink(site.code, url, false)
val total = links.size
val current = AtomicLong(0)
links
.filter {
if (hashList.contains(it.hash)) {
current.addAndGet(1)
false
} else true
}
.forEach { link ->
Thread.sleep(500)
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
if (!article.text.isNullOrBlank()) {
val triple = descriptionService.parseDescription(article.text)
logger.info("小标题:{}", triple?.first)
logger.info("描述:{}", triple?.second)
logger.info("相关度:{}", triple?.third)
article.subtitle = triple?.first
article.description = triple?.second
article.score = triple?.third
}
sites.forEach { run(it.code) }
}
article.category = keyword
articleRepository.save(article)
fun run(code: String) {
sites
.filter { it.code == code }
.forEach { site ->
logger.info("站点: {}", site.name)
keywords.forEach { keyword ->
val url = site.search.replace("{query}", keyword)
logger.info("类目: {}, 地址: {}", keyword, url)
val hashList = articleRepository.findAllId()
val links = parseArticleLink(site.code, url, false)
val total = links.size
val current = AtomicLong(0)
links
.filter {
if (hashList.contains(it.hash)) {
current.addAndGet(1)
false
} else true
}
}
.forEach { link ->
Thread.sleep(500)
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
if (!article.text.isNullOrBlank()) {
val triple = descriptionService.parseDescription(article.text)
logger.info("小标题:{}", triple?.first)
logger.info("描述:{}", triple?.second)
logger.info("相关度:{}", triple?.third)
article.subtitle = triple?.first
article.description = triple?.second
article.score = triple?.third
}
article.category = keyword
articleRepository.save(article)
}
}
}
}
}
logger.info("本轮采集完成")
}