feat: 增加新站点采集
This commit is contained in:
@@ -56,9 +56,18 @@ class Application : ApplicationRunner, ApplicationListener<ContextClosedEvent> {
|
||||
@Resource
|
||||
private lateinit var pushRunner: PushRunner
|
||||
|
||||
@Resource
|
||||
private lateinit var articleRepository: ArticleRepository
|
||||
|
||||
/**
 * Application start-up hook: collects news for every configured site that has
 * no stored article yet.
 *
 * All sites are first checked against [ArticleRepository.existsByCode]; the ones
 * for which it returns `false` are then logged and passed, by site code, to
 * `newsRunner.run`.
 *
 * @param args application arguments; not read by this method.
 */
override fun run(args: ApplicationArguments?) {
    // Start-up invocations that are currently disabled:
    // pushRunner.run()
    // newsRunner.run()

    // Evaluate the existence check for every site up front, before any collection starts.
    val newSites = sites.filterNot { site -> articleRepository.existsByCode(site.code) }
    for (site in newSites) {
        logger.info("发现新站点:{}", site.name)
        newsRunner.run(site.code)
    }
}
|
||||
|
||||
override fun onApplicationEvent(event: ContextClosedEvent) {
|
||||
|
||||
@@ -34,6 +34,8 @@ interface ArticleRepository : JpaRepository<Article, String>, JpaSpecificationEx
|
||||
/**
 * Returns the matching [Article] rows as a list.
 *
 * NOTE(review): presumably a Spring Data derived query selecting articles whose
 * `description` is null and whose `text` is not null — confirm against the entity.
 */
fun findAllByDescriptionIsNullAndTextIsNotNull(): List<Article>
|
||||
/**
 * Returns the matching [Article] rows as a list.
 *
 * NOTE(review): presumably a Spring Data derived query selecting articles whose
 * `html` is not null — confirm against the entity.
 */
fun findAllByHtmlIsNotNull(): List<Article>
|
||||
|
||||
/**
 * Existence check keyed by [code]; `Application.run` calls it with a site's code
 * to decide whether that site still needs an initial collection run.
 *
 * NOTE(review): presumably a Spring Data derived query that is true when at least
 * one [Article] has the given `code` — confirm against the entity mapping.
 *
 * @param code the code to look up.
 * @return the Boolean result of the existence check.
 */
fun existsByCode(code: String): Boolean
|
||||
|
||||
@Modifying
|
||||
@Transactional
|
||||
@Query("update Article article set article.pushed = :pushed where article.id = :id")
|
||||
|
||||
@@ -45,42 +45,48 @@ class NewsRunner : Runner {
|
||||
|
||||
@Scheduled(cron = "0 0 6-18 * * ?")
|
||||
override fun run() {
|
||||
sites.forEach { site ->
|
||||
logger.info("站点: {}", site.name)
|
||||
keywords.forEach { keyword ->
|
||||
val url = site.search.replace("{query}", keyword)
|
||||
logger.info("类目: {}, 地址: {}", keyword, url)
|
||||
val hashList = articleRepository.findAllId()
|
||||
val links = parseArticleLink(site.code, url, false)
|
||||
val total = links.size
|
||||
val current = AtomicLong(0)
|
||||
links
|
||||
.filter {
|
||||
if (hashList.contains(it.hash)) {
|
||||
current.addAndGet(1)
|
||||
false
|
||||
} else true
|
||||
}
|
||||
.forEach { link ->
|
||||
Thread.sleep(500)
|
||||
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
|
||||
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
|
||||
if (!article.text.isNullOrBlank()) {
|
||||
val triple = descriptionService.parseDescription(article.text)
|
||||
logger.info("小标题:{}", triple?.first)
|
||||
logger.info("描述:{}", triple?.second)
|
||||
logger.info("相关度:{}", triple?.third)
|
||||
article.subtitle = triple?.first
|
||||
article.description = triple?.second
|
||||
article.score = triple?.third
|
||||
}
|
||||
sites.forEach { run(it.code) }
|
||||
}
|
||||
|
||||
article.category = keyword
|
||||
articleRepository.save(article)
|
||||
fun run(code: String) {
|
||||
sites
|
||||
.filter { it.code == code }
|
||||
.forEach { site ->
|
||||
logger.info("站点: {}", site.name)
|
||||
keywords.forEach { keyword ->
|
||||
val url = site.search.replace("{query}", keyword)
|
||||
logger.info("类目: {}, 地址: {}", keyword, url)
|
||||
val hashList = articleRepository.findAllId()
|
||||
val links = parseArticleLink(site.code, url, false)
|
||||
val total = links.size
|
||||
val current = AtomicLong(0)
|
||||
links
|
||||
.filter {
|
||||
if (hashList.contains(it.hash)) {
|
||||
current.addAndGet(1)
|
||||
false
|
||||
} else true
|
||||
}
|
||||
}
|
||||
.forEach { link ->
|
||||
Thread.sleep(500)
|
||||
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
|
||||
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
|
||||
if (!article.text.isNullOrBlank()) {
|
||||
val triple = descriptionService.parseDescription(article.text)
|
||||
logger.info("小标题:{}", triple?.first)
|
||||
logger.info("描述:{}", triple?.second)
|
||||
logger.info("相关度:{}", triple?.third)
|
||||
article.subtitle = triple?.first
|
||||
article.description = triple?.second
|
||||
article.score = triple?.third
|
||||
}
|
||||
|
||||
article.category = keyword
|
||||
articleRepository.save(article)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("本轮采集完成")
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user