1
0

feat: 适配squirrel改造

增加AI解析页面、优化解析流程
This commit is contained in:
2024-11-06 17:27:05 +08:00
parent 3bf399f7aa
commit 14e84a0d4c
13 changed files with 319 additions and 354 deletions

View File

@@ -24,7 +24,7 @@ class Article(
@Column(columnDefinition = "longtext") var html: String?,
var subtitle: String?,
@Column(columnDefinition = "longtext") var description: String?,
var score: Int?,
var tags: String?,
var createTime: Date?,
var pushed: Boolean?,
)
@@ -41,7 +41,7 @@ interface ArticleRepository : JpaRepository<Article, String>, JpaSpecificationEx
@Query("update Article article set article.pushed = :pushed where article.id = :id")
fun updatePushedById(@Param("id") id: String, @Param("pushed") pushed: Boolean)
@Query("select article.id from Article article where article.description is not null and article.subtitle is not null and article.score is not null")
@Query("select article.id from Article article where article.description is not null and article.text is not null and article.text <> ''")
fun findAllId(): List<String>
@Query("select new com.lanyuanxiaoyao.digtal.market.CountGroupByString(article.code, count(article.code)) from Article article group by article.code")

View File

@@ -0,0 +1,153 @@
package com.lanyuanxiaoyao.digtal.market
import cn.hutool.core.date.DateUtil
import cn.hutool.crypto.SecureUtil
import com.lanyuanxiaoyao.squirrel.core.common.Management
import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
import dev.failsafe.Failsafe
import dev.failsafe.FailsafeException
import dev.failsafe.RetryPolicy
import dev.failsafe.function.CheckedSupplier
import kotlin.time.Duration.Companion.seconds
import kotlin.time.toJavaDuration
import kotlinx.coroutines.runBlocking
import org.jsoup.Jsoup
import org.slf4j.LoggerFactory
/**
 * One article entry discovered on a list/search page by [parseArticleLink].
 *
 * The property order is relied upon by the destructuring declaration in
 * `parseArticle(management, articleLink)`, so do not reorder the properties.
 *
 * @property code site code that is passed to `Management.parse` together with the URL
 * @property url link of the article page
 * @property hash MD5 of [url] as computed by [parseArticleLink]; `parseArticle` uses it as the `Article` id
 * @property title title taken from the list item, if the list item had one
 * @property datetime raw date/time text taken from the list item, if the list item had one
 */
data class ArticleLink(
val code: String,
val url: String,
val hash: String,
val title: String?,
val datetime: String?,
)
// Logger shared by the top-level parsing helpers in this file.
private val logger = LoggerFactory.getLogger("Helper")

// Failsafe policy wrapped around every `management.parse` call in this file:
// at most 2 retries with a 10 second delay between attempts. Only failures that
// are NOT a PageParseException are handled by the policy.
private val retryPolicy = RetryPolicy
    .builder<Any>()
    .withMaxRetries(2)
    .withDelay(10.seconds.toJavaDuration())
    .handleIf { failure -> failure !is PageParseException }
    .build()
/**
 * Collects the article links found on a list/search page.
 *
 * The page at [url] is parsed through [management] (wrapped in the shared retry policy) and every
 * entry of its `"list"` value that carries a non-blank `"link"` becomes an [ArticleLink] whose
 * hash is the MD5 of that link. When [recursive] is `true` the page's `"next"` value is followed
 * until it is missing or blank.
 *
 * @param management parser facade used to download and parse each page
 * @param code site code handed to `management.parse` and copied into every returned link
 * @param url first page to parse; `null` or blank yields an empty list
 * @param recursive whether to follow the `"next"` link of each page
 * @return the links collected so far; crawling stops at the first page that fails with a
 *   [FailsafeException] after the retries are exhausted
 */
@Suppress("UNCHECKED_CAST")
fun parseArticleLink(management: Management, code: String, url: String?, recursive: Boolean = false): List<ArticleLink> {
    // A missing search URL used to hit `next!!` inside the retried supplier: the NPE was retried
    // (with the retry delay) and then thrown to the caller. There is simply nothing to crawl.
    if (url.isNullOrBlank()) {
        logger.warn("链接为空:{} {}", code, url)
        return emptyList()
    }

    val links = mutableListOf<ArticleLink>()
    var next: String? = url
    do {
        // Immutable copy so the lambda below does not capture the mutable cursor.
        val current = next ?: break
        val page = try {
            Failsafe
                .with(retryPolicy)
                .get(CheckedSupplier {
                    runBlocking { management.parse(code, current) }
                })
        } catch (e: FailsafeException) {
            logger.error("解析失败:${current}", e)
            // `continue` would re-evaluate the loop condition with an unchanged `next`, so a
            // permanently failing page was retried forever when `recursive` is true.
            break
        }

        (page["list"] as? List<Map<String, Any>>)?.forEach { item ->
            // Safe casts: an unexpected value type is treated as absent instead of throwing.
            val title = item["title"] as? String
            val link = item["link"] as? String
            if (link.isNullOrBlank()) {
                logger.warn("链接为空:{} {}", title, current)
            } else {
                links.add(
                    ArticleLink(
                        code,
                        link,
                        SecureUtil.md5(link),
                        title,
                        item["datetime"] as? String,
                    )
                )
            }
        }

        next = page["next"] as? String
    } while (recursive && !next.isNullOrBlank())
    return links
}
/**
 * Convenience overload that parses the article described by [articleLink].
 *
 * Forwards the link's code, url, hash, title and datetime unchanged to the six-argument
 * `parseArticle` overload and returns its result.
 */
fun parseArticle(management: Management, articleLink: ArticleLink): Article? =
    parseArticle(
        management = management,
        code = articleLink.code,
        url = articleLink.url,
        hash = articleLink.hash,
        title = articleLink.title,
        datetime = articleLink.datetime,
    )
/** Matches a 13-digit epoch timestamp, interpreted as milliseconds. */
private val epochMillisPattern = Regex("\\d{13}")

/** Matches a 10-digit epoch timestamp, interpreted as seconds. */
private val epochSecondsPattern = Regex("\\d{10}")

/**
 * Converts raw date text into a timestamp.
 *
 * Accepts 13-digit epoch milliseconds, 10-digit epoch seconds, or one of the textual patterns
 * listed below. Returns `null` for missing/blank text (without logging) and for text that cannot
 * be parsed (after logging the failure).
 */
private fun parseCreateTime(datetimeText: String?) =
    if (datetimeText.isNullOrBlank()) {
        // Previously this fell through to DateUtil.parse(null, ...) and produced an error log
        // with a stack trace for every article that simply has no date.
        null
    } else {
        try {
            when {
                datetimeText.matches(epochMillisPattern) ->
                    DateUtil
                        .date(datetimeText.toLong())
                        .toTimestamp()

                datetimeText.matches(epochSecondsPattern) ->
                    DateUtil
                        .date(datetimeText.toLong() * 1000)
                        .toTimestamp()

                else ->
                    DateUtil
                        .parse(
                            datetimeText,
                            "yyyy-MM-dd HH:mm:ss",
                            "yyyy-MM-dd",
                            "yyyy年MM月dd日 HH:mm",
                            "yyyy-MM-dd HH:mm",
                        )
                        .toTimestamp()
            }
        } catch (e: Exception) {
            logger.error("解析 $datetimeText 失败", e)
            null
        }
    }

/**
 * Strips `<script>` and `<link>` elements and every `style` attribute from [rawHtml] and returns
 * the inner HTML of the resulting document body.
 */
private fun sanitizeHtml(rawHtml: String): String {
    val document = Jsoup.parse(rawHtml)
    document
        .select("script")
        .forEach { element -> element.remove() }
    document
        .select("link")
        .forEach { element -> element.remove() }
    document.forEachNode { node -> node.removeAttr("style") }
    return document
        .body()
        .html()
}

/**
 * Downloads and parses a single article page into an [Article].
 *
 * The page is parsed through [management] (wrapped in the shared retry policy). From the page's
 * `"text"` map this reads `"datetime"`, `"source"` and `"content"`; title, author, subtitle,
 * description and tags are left `null` here and are expected to be filled in by the caller.
 *
 * @param management parser facade used to download and parse the page
 * @param code site code handed to `management.parse` and stored on the article
 * @param url article URL
 * @param hash value stored as the article id
 * @param title title from the list page; only logged here
 * @param datetime date text from the list page, used when the article page itself has none
 * @return the parsed article, or `null` when parsing fails with a [FailsafeException] or the page
 *   has no `"text"` map
 */
fun parseArticle(management: Management, code: String, url: String, hash: String, title: String?, datetime: String?): Article? {
    logger.info("标题:{} 时间:{}", title, datetime)
    val page = try {
        Failsafe
            .with(retryPolicy)
            .get(CheckedSupplier {
                runBlocking { management.parse(code, url) }
            })
    } catch (e: FailsafeException) {
        logger.error("解析失败:${url}", e)
        return null
    }

    val text = page["text"] as? Map<*, *> ?: return null

    // Prefer the date found on the article page; fall back to the one from the list page.
    val pageDatetime = text["datetime"] as? String
    val createTime = parseCreateTime(if (pageDatetime.isNullOrBlank()) datetime else pageDatetime)

    // Blank source markup is kept as-is; anything else is cleaned before being stored.
    val sourceText = (text["source"] as? String).orEmpty()
    val source = if (sourceText.isBlank()) sourceText else sanitizeHtml(sourceText)

    return Article(
        id = hash,
        code = code,
        url = url,
        title = null,
        author = null,
        category = null,
        // Safe cast: a non-string "content" value is treated as empty instead of throwing.
        text = (text["content"] as? String).orEmpty(),
        html = source,
        subtitle = null,
        description = null,
        tags = null,
        createTime = createTime,
        pushed = false,
    )
}

View File

@@ -11,11 +11,11 @@ import com.lanyuanxiaoyao.squirrel.core.common.Rule
import com.lanyuanxiaoyao.squirrel.core.common.Script
import com.lanyuanxiaoyao.squirrel.core.common.Selector
import com.lanyuanxiaoyao.squirrel.core.common.Site
import com.lanyuanxiaoyao.squirrel.core.common.Site.Option.Companion.OPEN_WITH_IFRAME
private val html = Pair("html", "true")
private val iframe = Pair("iframe", "true")
private val post = Pair("post", "true")
private val form = Pair("form", "true")
private val html = "html" to "true"
private val post = "post" to "true"
private val form = "form" to "true"
// language=regexp
private val commonRemove = listOf(
@@ -23,19 +23,29 @@ private val commonRemove = listOf(
"&.+?;"
)
// Catch-all rule shared by the sites in this file (each site lists it in its `rules` map).
// Key: the regex "https*://.+". Value: a Rule that uses the BROWSER downloader and the CSS
// parser; its text content is located with the "body" expression, takes ":root" as `content`
// and additionally exposes a "source" extra that selects ":root" with the `html` property set.
// NOTE(review): presumably the `html` property makes the selector return markup rather than
// plain text — confirm against the squirrel library.
private val sourceRule =
// language=regexp
"https*://.+" to Rule(
downloader = Downloader.Type.BROWSER,
parser = Parser.Type.CSS,
text = Content(
expression = "body",
content = Selector(":root"),
extra = mapOf(
"source" to Selector(":root", properties = mapOf(html))
),
)
)
private val 广东政务服务和数据管理局 = Site(
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
name = "广东政务服务和数据管理局",
home = "https://zfsg.gd.gov.cn",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.SEARCH,
downloader = Downloader.Type.BROWSER,
search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
rules = mapOf(
// language=regexp
"https://search.gd.gov.cn/api/search/all.*" to Rule(
downloader = Downloader.Type.HTTP,
parser = Parser.Type.JSON,
list = Content(
expression = "$.data.news.list",
@@ -57,71 +67,7 @@ private val 广东政务服务和数据管理局 = Site(
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
text = Content(
expression = ".Con",
title = Selector("h3.zw-title"),
author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))),
dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))),
content = Selector(".zw"),
extra = mapOf(
"source" to Selector(".zw", properties = mapOf(html))
)
)
),
// language=regexp
"https://mp\\.weixin\\.qq\\.com/s/.+" to Rule(
text = Content(
expression = "#page-content",
title = Selector("#activity-name"),
author = Selector("#js_name"),
dateTime = Selector("#publish_time"),
content = Selector("#js_content"),
extra = mapOf(
"source" to Selector("#js_content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule(
text = Content(
expression = "#article-container",
title = Selector("#article-title"),
author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))),
dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))),
content = Selector("#article-content"),
extra = mapOf(
"source" to Selector("#article-content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule(
text = Content(
expression = "body",
title = Selector(".title-page .txt > span"),
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
content = Selector(".content"),
extra = mapOf(
"source" to Selector(".content", properties = mapOf(html))
)
)
),
sourceRule,
)
)
@@ -130,11 +76,8 @@ private val 深圳市政务服务和数据管理局 = Site(
name = "深圳市政务服务和数据管理局",
home = "https://www.sz.gov.cn/szzsj/gkmlpt/index",
icon = "https://www.sz.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
options = listOf(OPEN_WITH_IFRAME),
search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
rules = mapOf(
// language=regexp
@@ -179,20 +122,7 @@ private val 深圳市政务服务和数据管理局 = Site(
)
)
),
// language=regexp
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
)
)
sourceRule,
)
)
@@ -201,12 +131,9 @@ private val 中华人民共和国中央人民政府 = Site(
name = "中华人民共和国中央人民政府",
home = "https://www.gov.cn",
icon = "https://www.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
options = listOf(OPEN_WITH_IFRAME),
rules = mapOf(
// language=regexp
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
@@ -216,10 +143,10 @@ private val 中华人民共和国中央人民政府 = Site(
"Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D",
"Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
),
properties = mapOf("post" to "true"),
properties = mapOf(post),
list = Content(
expression = "$.result.data.middle.list",
title = Selector("$.title_no_tag"),
title = Selector("$.title_no_tag", process = Process(remove = commonRemove)),
dateTime = Selector("$.time"),
link = Selector("$.url"),
),
@@ -236,51 +163,7 @@ private val 中华人民共和国中央人民政府 = Site(
)
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/(zhengceku)/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".policyLibraryOverview_content",
author = Selector("td:contains(源:) + td"),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content .article",
title = Selector("h1#ti"),
author = Selector(
".pages-date > .font",
process = Process(
default = "中华人民共和国中央人民政府",
remove = listOf("来源:")
)
),
dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".main-content",
title = Selector(".qa_content_box"),
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("\\s*源:"))),
content = Selector(".qa_content_text"),
extra = mapOf(
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
)
)
),
sourceRule,
)
)
@@ -289,12 +172,10 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
name = "中华人民共和国国家互联网信息办公室",
home = "https://www.cac.gov.cn",
icon = "https://www.cac.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
rules = mapOf(
// language=regexp
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
list = Content(
expression = ".xpage-container .list-item",
@@ -302,24 +183,9 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
dateTime = Selector(".search_time"),
link = Selector("a", "href", process = Process(prefix = "https:"))
),
next = Selector(
".xpage-pagination .xpage-pagination-next a:contains(下一页)",
"href",
Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/")
)
next = Selector(".xpage-pagination .xpage-pagination-next a:contains(下一页)", "href", Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/"))
),
"https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" to Rule(
text = Content(
expression = ".main",
title = Selector("h1.title"),
author = Selector("#source", process = Process(remove = listOf("来源:"))),
dateTime = Selector("#pubtime"),
content = Selector(".main-content"),
extra = mapOf(
"source" to Selector(".main", properties = mapOf("html" to "true"))
)
)
)
sourceRule,
)
)
@@ -330,7 +196,6 @@ val 广州数据交易所 = Site(
home = "https://www.cantonde.com",
icon = "https://www.cantonde.com/favicon.ico",
description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场,助力数字经济高质量发展工作部署,高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽,是畅通数据要素大循环的关键举措,也是推进数据要素市场化配置改革的重要载体。",
target = Site.Target.SEARCH,
parser = Parser.Type.JSON,
search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
rules = mapOf(
@@ -346,7 +211,7 @@ val 广州数据交易所 = Site(
Script(
Script.Type.Javascript,
// language=javascript
"return `https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%22\${text}%22%7D`",
"return `https://www.cantonde.com/info.html#/infoDetail?id=\${text}`",
)
)
)
@@ -365,26 +230,16 @@ val 广州数据交易所 = Site(
)
),
),
// language=regexp
"https://www\\.cantonde\\.com/si/info/detail\\?json=.+" to Rule(
text = Content(
expression = "$.data",
title = Selector("$.TITLE"),
content = Selector("$.CONTENT", process = Process(remove = commonRemove)),
author = Selector(process = Process(default = "广州数据交易所"))
),
properties = mapOf(post),
),
sourceRule,
),
)
val 北京市政务服务和数据管理局 = Site(
private val 北京市政务服务和数据管理局 = Site(
code = "cee7f242-668b-41fb-adbc-96fb27d4bf35",
name = "北京市政务服务和数据管理局",
author = "lanyuanxiaoyao",
home = "https://zwfwj.beijing.gov.cn",
icon = "https://zwfwj.beijing.gov.cn/favicon.ico",
target = Site.Target.SEARCH,
parser = Parser.Type.JSON,
search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20",
rules = mapOf(
@@ -420,18 +275,7 @@ val 北京市政务服务和数据管理局 = Site(
)
)
),
// language=regexp
"https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(
parser = Parser.Type.CSS,
downloader = Downloader.Type.BROWSER,
text = Content(
expression = "#main .details_page",
title = Selector("h1"),
author = Selector(".article-info .ly", process = Process(remove = listOf("来源:"))),
dateTime = Selector(".article-info span:contains(时间)", process = Process(remove = listOf("时间:"))),
content = Selector("#div_zhengwen")
)
)
sourceRule,
)
)

View File

@@ -2,6 +2,7 @@ package com.lanyuanxiaoyao.digtal.market.controller
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.digtal.market.sites
import com.lanyuanxiaoyao.squirrel.core.common.Site
import jakarta.annotation.Resource
import org.slf4j.LoggerFactory
import org.springframework.data.domain.PageRequest
@@ -29,8 +30,10 @@ class OverviewController {
val result = articleRepository.findAll(request)
return mapOf(
"items" to result.content.map {
val iframe = (sites.firstOrNull { site -> site.code == it.code }?.properties?.get("iframe")?: "false").toBoolean()
val site = sites.find { site -> site.code == it.code }!!
val iframe = (site.options?.contains(Site.Option.OPEN_WITH_IFRAME) ?: false)
mapOf(
"name" to site.name,
"code" to it.code,
"title" to it.title,
"subtitle" to it.subtitle,
@@ -40,6 +43,7 @@ class OverviewController {
"createTime" to it.createTime,
"iframe" to iframe,
"category" to it.category,
"tags" to it.tags,
)
},
"total" to result.totalElements,

View File

@@ -1,7 +1,5 @@
package com.lanyuanxiaoyao.digtal.market.controller
import cn.hutool.json.JSON
import cn.hutool.json.JSONUtil
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
import jakarta.annotation.Resource
@@ -24,7 +22,7 @@ class SiteController {
private lateinit var management: JvmManagement
@GetMapping("list")
fun list(): List<Map<String, Any>> {
fun list(): List<Map<String, Any?>> {
val countMap = articleRepository.countGroupByCode().associate { it.key to it.count }
return management.exportSites()
.map { site ->
@@ -34,7 +32,7 @@ class SiteController {
"icon" to site.icon,
"url" to site.home,
"description" to site.description,
"iframe" to site.properties.containsKey("iframe"),
"iframe" to (site.properties?.containsKey("iframe") ?: false),
"news" to (countMap[site.code] ?: 0)
)
}

View File

@@ -1,25 +1,19 @@
package com.lanyuanxiaoyao.digtal.market.runner
import cn.hutool.core.date.DateUtil
import cn.hutool.core.util.NumberUtil
import cn.hutool.crypto.SecureUtil
import com.lanyuanxiaoyao.digtal.market.Article
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.digtal.market.keywords
import com.lanyuanxiaoyao.digtal.market.parseArticle
import com.lanyuanxiaoyao.digtal.market.parseArticleLink
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
import com.lanyuanxiaoyao.digtal.market.sites
import com.lanyuanxiaoyao.squirrel.core.common.Management
import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
import dev.failsafe.Failsafe
import dev.failsafe.FailsafeException
import dev.failsafe.RetryPolicy
import dev.failsafe.function.CheckedSupplier
import jakarta.annotation.Resource
import java.util.concurrent.atomic.AtomicLong
import kotlin.time.Duration.Companion.seconds
import kotlin.time.toJavaDuration
import kotlinx.coroutines.runBlocking
import org.jsoup.Jsoup
import org.slf4j.LoggerFactory
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Service
@@ -54,12 +48,18 @@ class NewsRunner : Runner {
.forEach { site ->
logger.info("站点: {}", site.name)
keywords.forEach { keyword ->
val url = site.search.replace("{query}", keyword)
logger.info("类目: {}, 地址: {}", keyword, url)
val hashList = articleRepository.findAllId()
val links = parseArticleLink(site.code, url, false)
val url = site.search?.replace("{query}", keyword)
logger.info("正在搜索: {}, 地址: {}", keyword, url)
val links =
try {
parseArticleLink(management, site.code, url, false)
} catch (e: Exception) {
logger.error("解析失败 $url", e)
emptyList()
}
val total = links.size
val current = AtomicLong(0)
val hashList = articleRepository.findAllId()
links
.filter {
if (hashList.contains(it.hash)) {
@@ -68,128 +68,36 @@ class NewsRunner : Runner {
} else true
}
.forEach { link ->
Thread.sleep(500)
Thread.sleep(1000)
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
if (!article.text.isNullOrBlank()) {
val triple = descriptionService.parseDescription(article.text)
logger.info("小标题:{}", triple?.first)
logger.info("描述:{}", triple?.second)
logger.info("相关度:{}", triple?.third)
article.subtitle = triple?.first
article.description = triple?.second
article.score = triple?.third
}
try {
parseArticle(management, link)
?.let { article ->
if (!article.text.isNullOrBlank() || !article.html.isNullOrBlank()) {
val content = if (article.text.isNullOrBlank()) article.html else article.text
val triple = descriptionService.parseDescription(content)
logger.info("标题:{}", triple?.title)
logger.info("作者:{}", triple?.author)
logger.info("副标题:{}", triple?.subtitle)
logger.info("描述:{}", triple?.description)
logger.info("标签:{}", triple?.tags)
article.category = keyword
articleRepository.save(article)
article.title = if (link.title.isNullOrBlank()) triple?.title else link.title
article.author = if (article.author.isNullOrBlank()) triple?.author else article.author
article.subtitle = triple?.subtitle
article.description = triple?.description
article.tags = triple?.tags
}
article.category = keyword
articleRepository.save(article)
}
} catch (e: Exception) {
logger.error("解析失败 $link", e)
}
}
}
}
logger.info("本轮采集完成")
}
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
logger.info("Title: {} Datetime: {}", title, datetime)
val page = try {
Failsafe
.with(retryPolicy)
.get(CheckedSupplier {
runBlocking { management.parse(code, url) }
})
} catch (e: FailsafeException) {
logger.error("Parse failure", e)
return null
}
(page["text"] as? Map<*, *>)?.let { text ->
var datetimeText = text["datetime"] as? String
val createTime = try {
if (datetimeText.isNullOrBlank()) {
datetimeText = datetime
}
if (datetimeText?.matches(Regex("\\d{10}")) == true) {
DateUtil
.date(datetimeText.toLong())
.toTimestamp()
} else if (datetimeText?.matches(Regex("\\d{7}")) == true) {
DateUtil
.date(datetimeText.toLong() * 1000)
.toTimestamp()
} else {
DateUtil
.parse(
datetimeText,
"yyyy-MM-dd HH:mm:ss",
"yyyy-MM-dd",
"yyyy年MM月dd日 HH:mm",
"yyyy-MM-dd HH:mm",
)
.toTimestamp()
}
} catch (e: Exception) {
logger.error("Parse $datetimeText error", e)
null
}
val source = text["source"]?.let {
val document = Jsoup.parse((it as String))
document
.select("script")
.forEach { node -> node.remove() }
document.forEachNode { node -> node.removeAttr("style") }
document
.body()
.html()
}
return Article(
id = hash,
code = code,
url = url,
title = text["title"] as String?,
author = text["author"] as String?,
category = null,
text = if ((text["content"] as String?) == null) null else text["content"] as String,
html = source,
subtitle = null,
description = null,
score = null,
createTime = createTime,
pushed = false,
)
} ?: return null
}
data class ArticleLink(val url: String, val hash: String, val title: String?, val datetime: String?)
fun parseArticleLink(code: String, url: String, recursive: Boolean = false): List<ArticleLink> {
var next: String? = url
val links = mutableListOf<ArticleLink>()
do {
logger.info("解析目录:{}", next)
val page = try {
Failsafe
.with(retryPolicy)
.get(CheckedSupplier {
runBlocking { management.parse(code, next!!) }
})
} catch (e: FailsafeException) {
logger.error("Parse failure", e)
continue
}
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
for (item in list) {
val title = item["title"] as String?
val datetime = item["datetime"] as String?
val link = item["link"] as String?
if (link.isNullOrBlank()) {
logger.warn("链接为空:{} {}", title, link)
} else {
links.add(ArticleLink(link, SecureUtil.md5(link), title, datetime))
}
}
}
next = page["next"] as String?
} while (recursive && !next.isNullOrBlank())
return links
}
}

View File

@@ -12,6 +12,14 @@ import kotlin.time.toJavaDuration
import org.slf4j.LoggerFactory
import org.springframework.stereotype.Service
/**
 * Structured result returned by [DescriptionService.parseDescription].
 *
 * Each property is read from the same-named path ("title", "author", "subtitle",
 * "description", "tags") of the parsed JSON root.
 *
 * @property title title extracted for the article
 * @property author author extracted for the article
 * @property subtitle subtitle extracted for the article
 * @property description description extracted for the article
 * @property tags tags as a single string
 */
data class Description(
val title: String,
val author: String,
val subtitle: String,
val description: String,
val tags: String,
)
@Service
class DescriptionService {
private val logger = LoggerFactory.getLogger(javaClass)
@@ -26,7 +34,7 @@ class DescriptionService {
QianfanChat(),
)
fun parseDescription(content: String?): Triple<String, String, Int>? {
fun parseDescription(content: String?): Description? {
return content?.let {
if (it.isNotBlank()) {
var description: String?
@@ -50,11 +58,16 @@ class DescriptionService {
.replace("```json", "")
.replace("```", "")
)
val subtitle = root.getByPath("title", String::class.java)
val title = root.getByPath("title", String::class.java)
val author = root.getByPath("author", String::class.java)
val subtitle = root.getByPath("subtitle", String::class.java)
val desc = root.getByPath("description", String::class.java)
val score = root.getByPath("score", Int::class.java)
return@let Triple(subtitle, desc, score)
val tags = root
.getByPath("tags", String::class.java)
.replace("", ",")
return@let Description(title, author, subtitle, desc, tags)
} catch (e: Throwable) {
logger.error("json解析失败", e)
continue
}
}

View File

@@ -5,7 +5,7 @@ function pagination() {
model: 'normal',
maxButtons: 10,
showPageInput: false,
perPageAvailable: [10, 15, 20],
perPageAvailable: [10, 15, 20, 50, 100],
activePage: '${page|default:1}',
total: '${total|default:0}',
className: 'text-right',
@@ -131,6 +131,11 @@ function overviewTab() {
className: 'text-current',
tpl: '${description}',
},
{
type: 'tpl',
className: 'text-blue-900 text-sm mt-2',
tpl: '${name}',
},
{
type: 'wrapper',
size: 'none',
@@ -152,10 +157,21 @@ function overviewTab() {
type: 'tag',
label: '${author}',
displayMode: 'rounded',
color: '#ff8888',
color: '#bd6464',
},
]
},
{
type: 'each',
className: 'mt-2',
source: "${SPLIT(tags, ',')}",
items: {
type: 'tag',
label: '${item}',
displayMode: 'rounded',
color: '#6b3481',
}
}
]
},
},

View File

@@ -48,7 +48,7 @@
]
}
}
let debug = true
let debug = false
let server = amis.embed(
'#root',
amisJSON,

View File

@@ -1,6 +1,7 @@
package com.lanyuanxiaoyao.digtal.market
import cn.hutool.core.date.DateUtil
import cn.hutool.core.io.FileUtil
import com.lanyuanxiaoyao.digtal.market.ai.Chat
import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat
import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat
@@ -144,4 +145,18 @@ class Test {
)
)
}
/**
 * Manual check: downloads one page with the basic downloader, removes script/style/link
 * elements and inline `style` attributes from its body, and writes the remaining HTML to a
 * local file for inspection.
 */
@Test
fun testNativeDownload() {
    val html = runBlocking {
        BasicDownloaderFactory()
            .build(emptyMap())
            .download("http://zfsg.gd.gov.cn/xxfb/dtxw/content/post_4515949.html")
    }
    val body = Jsoup.parse(html).body()
    // Drop non-content elements first, then strip inline styling from what is left.
    for (tag in listOf("script", "style", "link")) {
        body.select(tag).forEach { element -> element.remove() }
    }
    body.allElements.forEach { element -> element.removeAttr("style") }
    FileUtil.writeString(body.html(), "/Users/lanyuanxiaoyao/Project/IdeaProjects/digtal-market/source.txt", Charsets.UTF_8)
}
}

View File

@@ -24,18 +24,19 @@ class TestManagement {
@Test
fun testParse() {
newsRunner
.parseArticleLink(
"9a7f1d8f-4f39-4120-adeb-7435339b97bb",
"https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22数据要素%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
)
parseArticleLink(
management,
"9a7f1d8f-4f39-4120-adeb-7435339b97bb",
"https://www.gov.cn/zhengce/content/202409/content_6977766.htm",
)
.forEach { link ->
val article = newsRunner.parseArticle(
val article = parseArticle(
management,
"9a7f1d8f-4f39-4120-adeb-7435339b97bb",
link.url,
"",
link.title,
link.datetime,
"",
)
logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text)
}
@@ -54,26 +55,26 @@ class TestManagement {
@Test
fun testParseList() {
newsRunner
.parseArticleLink(
"cee7f242-668b-41fb-adbc-96fb27d4bf35",
"https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
true,
)
parseArticleLink(
management,
"cee7f242-668b-41fb-adbc-96fb27d4bf35",
"https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
true,
)
// .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
.let { it.forEach { logger.info("{}", it.url) } }
}
@Test
fun testParseArticle() {
newsRunner
.parseArticle(
"00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
"https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
"工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知",
"2024-11-01 12:48:26",
"",
)
parseArticle(
management,
"00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
"https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
"",
"工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知",
"2024-11-01 12:48:26",
)
?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
}
}

View File

@@ -14,8 +14,8 @@ class TestRule {
private lateinit var management: Management
private val link =
"https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html"
private val site = 北京市政务服务和数据管理局
"https://www.gov.cn/zhengce/202410/content_6979047.htm"
private val site = sites.find { it.code == "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be" }!!
@BeforeTest
fun before() {

View File

@@ -70,4 +70,17 @@ Content-Type: application/json
}
### Search
POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
### Search
POST https://www.cantonde.com/si/common/searchInfo
Content-Type: application/json
{
"NAME": "数据要素",
"IN_CATEGORY": "",
"NOT_IN_CATEGORY": "",
"CATEGORY": "",
"pageNo": 1,
"pageSize": 10
}