1
0

feat: 适配squirrel改造

增加AI解析页面、优化解析流程
This commit is contained in:
2024-11-06 17:27:05 +08:00
parent 3bf399f7aa
commit 14e84a0d4c
13 changed files with 319 additions and 354 deletions

View File

@@ -24,7 +24,7 @@ class Article(
@Column(columnDefinition = "longtext") var html: String?, @Column(columnDefinition = "longtext") var html: String?,
var subtitle: String?, var subtitle: String?,
@Column(columnDefinition = "longtext") var description: String?, @Column(columnDefinition = "longtext") var description: String?,
var score: Int?, var tags: String?,
var createTime: Date?, var createTime: Date?,
var pushed: Boolean?, var pushed: Boolean?,
) )
@@ -41,7 +41,7 @@ interface ArticleRepository : JpaRepository<Article, String>, JpaSpecificationEx
@Query("update Article article set article.pushed = :pushed where article.id = :id") @Query("update Article article set article.pushed = :pushed where article.id = :id")
fun updatePushedById(@Param("id") id: String, @Param("pushed") pushed: Boolean) fun updatePushedById(@Param("id") id: String, @Param("pushed") pushed: Boolean)
@Query("select article.id from Article article where article.description is not null and article.subtitle is not null and article.score is not null") @Query("select article.id from Article article where article.description is not null and article.text is not null and article.text <> ''")
fun findAllId(): List<String> fun findAllId(): List<String>
@Query("select new com.lanyuanxiaoyao.digtal.market.CountGroupByString(article.code, count(article.code)) from Article article group by article.code") @Query("select new com.lanyuanxiaoyao.digtal.market.CountGroupByString(article.code, count(article.code)) from Article article group by article.code")

View File

@@ -0,0 +1,153 @@
package com.lanyuanxiaoyao.digtal.market
import cn.hutool.core.date.DateUtil
import cn.hutool.crypto.SecureUtil
import com.lanyuanxiaoyao.squirrel.core.common.Management
import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
import dev.failsafe.Failsafe
import dev.failsafe.FailsafeException
import dev.failsafe.RetryPolicy
import dev.failsafe.function.CheckedSupplier
import kotlin.time.Duration.Companion.seconds
import kotlin.time.toJavaDuration
import kotlinx.coroutines.runBlocking
import org.jsoup.Jsoup
import org.slf4j.LoggerFactory
/**
 * A link to an article discovered on a listing/search page.
 *
 * As built by `parseArticleLink` in this file:
 * - [code] is the site code that was passed to the parser,
 * - [url] is the non-blank `link` value of the list item,
 * - [hash] is the MD5 digest of [url],
 * - [title] and [datetime] are the raw `title` / `datetime` values of the list item
 *   and may be absent.
 *
 * The property order matters: `parseArticle` destructures this class positionally.
 */
data class ArticleLink(
val code: String,
val url: String,
val hash: String,
val title: String?,
val datetime: String?,
)
/** Logger shared by the top-level parsing helpers in this file. */
private val logger = LoggerFactory.getLogger("Helper")

/**
 * Retry policy applied to every page parse in this file.
 *
 * A failed attempt is retried up to 2 more times with a 10 second delay between attempts.
 * Failures that are a [PageParseException] are not handled by the policy, so they are
 * not retried.
 */
private val retryPolicy = RetryPolicy
    .builder<Any>()
    .withDelay(10.seconds.toJavaDuration())
    .withMaxRetries(2)
    .handleIf { failure -> failure !is PageParseException }
    .build()
/**
 * Parses a listing/search page (and optionally the pages that follow it) into article links.
 *
 * Each page is parsed through [management] under the shared retry policy. Every entry of the
 * page's `list` value that has a non-blank `link` becomes an [ArticleLink] whose hash is the
 * MD5 digest of that link; entries without a link are logged and skipped.
 *
 * @param management parser used to fetch and parse each page
 * @param code site code handed to the parser and copied into every returned link
 * @param url first page to parse; when `null` or blank nothing is parsed and an empty list is returned
 * @param recursive when `true`, keeps following the page's `next` value until it is missing or blank
 * @return the links collected so far; a page that still fails after the retries ends the walk
 */
@Suppress("UNCHECKED_CAST")
fun parseArticleLink(management: Management, code: String, url: String?, recursive: Boolean = false): List<ArticleLink> {
    val links = mutableListOf<ArticleLink>()
    // A null/blank start URL can never be parsed; bail out instead of failing inside the retried call.
    var next: String? = url?.takeUnless { it.isBlank() }
    while (next != null) {
        val current: String = next
        val page = try {
            Failsafe
                .with(retryPolicy)
                .get(CheckedSupplier {
                    runBlocking { management.parse(code, current) }
                })
        } catch (e: FailsafeException) {
            logger.error("解析失败:${current}", e)
            // Stop here: the next-page pointer is unknown, and retrying the same URL
            // again would loop forever when recursive is true.
            break
        }
        (page["list"] as? List<Map<String, Any>>)?.forEach { item ->
            val link = item["link"] as? String
            if (link.isNullOrBlank()) {
                logger.warn("链接为空:{} {}", item["title"] as? String, current)
            } else {
                links.add(
                    ArticleLink(
                        code,
                        link,
                        SecureUtil.md5(link),
                        item["title"] as? String,
                        item["datetime"] as? String,
                    )
                )
            }
        }
        // Only follow pagination when asked to, and only while a usable next URL exists.
        next = (page["next"] as? String)?.takeIf { recursive && it.isNotBlank() }
    }
    return links
}
/**
 * Convenience overload: parses the article behind [articleLink] by forwarding its
 * code, url, hash, title and datetime to the field-by-field `parseArticle` overload.
 *
 * @return whatever the field-by-field overload returns for those values
 */
fun parseArticle(management: Management, articleLink: ArticleLink): Article? =
    parseArticle(
        management,
        articleLink.code,
        articleLink.url,
        articleLink.hash,
        articleLink.title,
        articleLink.datetime,
    )
/** Matches a value made of exactly 13 digits, treated as epoch milliseconds. */
private val epochMillisPattern = Regex("\\d{13}")

/** Matches a value made of exactly 10 digits, treated as epoch seconds. */
private val epochSecondsPattern = Regex("\\d{10}")

/**
 * Turns a raw date string into a timestamp.
 *
 * Accepts 13-digit epoch milliseconds, 10-digit epoch seconds, or one of the textual
 * patterns listed below. A missing or blank value yields `null` without logging, because
 * an article without a date is an expected case; an unparseable value is logged and also
 * yields `null`.
 */
private fun parseCreateTime(raw: String?): java.util.Date? {
    val value = raw?.trim()
    if (value.isNullOrEmpty()) return null
    return try {
        when {
            value.matches(epochMillisPattern) -> DateUtil.date(value.toLong()).toTimestamp()
            value.matches(epochSecondsPattern) -> DateUtil.date(value.toLong() * 1000).toTimestamp()
            else -> DateUtil
                .parse(
                    value,
                    "yyyy-MM-dd HH:mm:ss",
                    "yyyy-MM-dd",
                    "yyyy年MM月dd日 HH:mm",
                    "yyyy-MM-dd HH:mm",
                )
                .toTimestamp()
        }
    } catch (e: Exception) {
        logger.error("解析 $value 失败", e)
        null
    }
}

/**
 * Strips `script` and `link` elements and every `style` attribute from [rawHtml] and
 * returns the resulting body markup. Blank input is returned unchanged.
 */
private fun cleanSourceHtml(rawHtml: String): String {
    if (rawHtml.isBlank()) return rawHtml
    val document = Jsoup.parse(rawHtml)
    document.select("script, link").remove()
    document.forEachNode { node -> node.removeAttr("style") }
    return document.body().html()
}

/**
 * Parses a single article page into an [Article].
 *
 * The page is parsed through [management] under the shared retry policy. The creation time
 * comes from the page's own `datetime` value, falling back to [datetime] when that is missing
 * or blank. The page's `source` markup is cleaned before being stored as `html`.
 * Title, author, category, subtitle, description and tags are left `null` here.
 *
 * @param management parser used to fetch and parse the page
 * @param code site code handed to the parser and stored on the article
 * @param url article address
 * @param hash value stored as the article id
 * @param title title from the listing page; only logged here
 * @param datetime date string from the listing page, used as a fallback
 * @return the parsed article, or `null` when parsing fails after the retries or the page
 *   has no `text` map
 */
fun parseArticle(management: Management, code: String, url: String, hash: String, title: String?, datetime: String?): Article? {
    logger.info("标题:{} 时间:{}", title, datetime)
    val page = try {
        Failsafe
            .with(retryPolicy)
            .get(CheckedSupplier {
                runBlocking { management.parse(code, url) }
            })
    } catch (e: FailsafeException) {
        logger.error("解析失败:${url}", e)
        return null
    }
    val text = (page["text"] as? Map<*, *>) ?: return null

    // Prefer the date found on the article page; fall back to the one from the listing.
    val rawDatetime = (text["datetime"] as? String)?.takeUnless { it.isBlank() } ?: datetime

    return Article(
        id = hash,
        code = code,
        url = url,
        title = null,
        author = null,
        category = null,
        text = (text["content"] as? String) ?: "",
        html = cleanSourceHtml((text["source"] as? String) ?: ""),
        subtitle = null,
        description = null,
        tags = null,
        createTime = parseCreateTime(rawDatetime),
        pushed = false,
    )
}

View File

@@ -11,11 +11,11 @@ import com.lanyuanxiaoyao.squirrel.core.common.Rule
import com.lanyuanxiaoyao.squirrel.core.common.Script import com.lanyuanxiaoyao.squirrel.core.common.Script
import com.lanyuanxiaoyao.squirrel.core.common.Selector import com.lanyuanxiaoyao.squirrel.core.common.Selector
import com.lanyuanxiaoyao.squirrel.core.common.Site import com.lanyuanxiaoyao.squirrel.core.common.Site
import com.lanyuanxiaoyao.squirrel.core.common.Site.Option.Companion.OPEN_WITH_IFRAME
private val html = Pair("html", "true") private val html = "html" to "true"
private val iframe = Pair("iframe", "true") private val post = "post" to "true"
private val post = Pair("post", "true") private val form = "form" to "true"
private val form = Pair("form", "true")
// language=regexp // language=regexp
private val commonRemove = listOf( private val commonRemove = listOf(
@@ -23,19 +23,29 @@ private val commonRemove = listOf(
"&.+?;" "&.+?;"
) )
private val sourceRule =
// language=regexp
"https*://.+" to Rule(
downloader = Downloader.Type.BROWSER,
parser = Parser.Type.CSS,
text = Content(
expression = "body",
content = Selector(":root"),
extra = mapOf(
"source" to Selector(":root", properties = mapOf(html))
),
)
)
private val 广东政务服务和数据管理局 = Site( private val 广东政务服务和数据管理局 = Site(
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
name = "广东政务服务和数据管理局", name = "广东政务服务和数据管理局",
home = "https://zfsg.gd.gov.cn", home = "https://zfsg.gd.gov.cn",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
target = Site.Target.SEARCH,
downloader = Downloader.Type.BROWSER,
search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D", search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
rules = mapOf( rules = mapOf(
// language=regexp // language=regexp
"https://search.gd.gov.cn/api/search/all.*" to Rule( "https://search.gd.gov.cn/api/search/all.*" to Rule(
downloader = Downloader.Type.HTTP,
parser = Parser.Type.JSON, parser = Parser.Type.JSON,
list = Content( list = Content(
expression = "$.data.news.list", expression = "$.data.news.list",
@@ -57,71 +67,7 @@ private val 广东政务服务和数据管理局 = Site(
) )
) )
), ),
// language=regexp sourceRule,
"https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
text = Content(
expression = ".Con",
title = Selector("h3.zw-title"),
author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))),
dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))),
content = Selector(".zw"),
extra = mapOf(
"source" to Selector(".zw", properties = mapOf(html))
)
)
),
// language=regexp
"https://mp\\.weixin\\.qq\\.com/s/.+" to Rule(
text = Content(
expression = "#page-content",
title = Selector("#activity-name"),
author = Selector("#js_name"),
dateTime = Selector("#publish_time"),
content = Selector("#js_content"),
extra = mapOf(
"source" to Selector("#js_content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule(
text = Content(
expression = "#article-container",
title = Selector("#article-title"),
author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))),
dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))),
content = Selector("#article-content"),
extra = mapOf(
"source" to Selector("#article-content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule(
text = Content(
expression = "body",
title = Selector(".title-page .txt > span"),
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
content = Selector(".content"),
extra = mapOf(
"source" to Selector(".content", properties = mapOf(html))
)
)
),
) )
) )
@@ -130,11 +76,8 @@ private val 深圳市政务服务和数据管理局 = Site(
name = "深圳市政务服务和数据管理局", name = "深圳市政务服务和数据管理局",
home = "https://www.sz.gov.cn/szzsj/gkmlpt/index", home = "https://www.sz.gov.cn/szzsj/gkmlpt/index",
icon = "https://www.sz.gov.cn/favicon.ico", icon = "https://www.sz.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
target = Site.Target.TEXT, options = listOf(OPEN_WITH_IFRAME),
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all", search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
rules = mapOf( rules = mapOf(
// language=regexp // language=regexp
@@ -179,20 +122,7 @@ private val 深圳市政务服务和数据管理局 = Site(
) )
) )
), ),
// language=regexp sourceRule,
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
)
)
) )
) )
@@ -201,12 +131,9 @@ private val 中华人民共和国中央人民政府 = Site(
name = "中华人民共和国中央人民政府", name = "中华人民共和国中央人民政府",
home = "https://www.gov.cn", home = "https://www.gov.cn",
icon = "https://www.gov.cn/favicon.ico", icon = "https://www.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D", search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
options = listOf(OPEN_WITH_IFRAME),
rules = mapOf( rules = mapOf(
// language=regexp // language=regexp
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule( "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
@@ -216,10 +143,10 @@ private val 中华人民共和国中央人民政府 = Site(
"Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D", "Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D",
"Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2", "Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
), ),
properties = mapOf("post" to "true"), properties = mapOf(post),
list = Content( list = Content(
expression = "$.result.data.middle.list", expression = "$.result.data.middle.list",
title = Selector("$.title_no_tag"), title = Selector("$.title_no_tag", process = Process(remove = commonRemove)),
dateTime = Selector("$.time"), dateTime = Selector("$.time"),
link = Selector("$.url"), link = Selector("$.url"),
), ),
@@ -236,51 +163,7 @@ private val 中华人民共和国中央人民政府 = Site(
) )
) )
), ),
// language=regexp sourceRule,
"https*://www\\.gov\\.cn/.+/(zhengceku)/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".policyLibraryOverview_content",
author = Selector("td:contains(源:) + td"),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content .article",
title = Selector("h1#ti"),
author = Selector(
".pages-date > .font",
process = Process(
default = "中华人民共和国中央人民政府",
remove = listOf("来源:")
)
),
dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".main-content",
title = Selector(".qa_content_box"),
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("\\s*源:"))),
content = Selector(".qa_content_text"),
extra = mapOf(
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
)
)
),
) )
) )
@@ -289,12 +172,10 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
name = "中华人民共和国国家互联网信息办公室", name = "中华人民共和国国家互联网信息办公室",
home = "https://www.cac.gov.cn", home = "https://www.cac.gov.cn",
icon = "https://www.cac.gov.cn/favicon.ico", icon = "https://www.cac.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
rules = mapOf( rules = mapOf(
// language=regexp
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule( "https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
list = Content( list = Content(
expression = ".xpage-container .list-item", expression = ".xpage-container .list-item",
@@ -302,24 +183,9 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
dateTime = Selector(".search_time"), dateTime = Selector(".search_time"),
link = Selector("a", "href", process = Process(prefix = "https:")) link = Selector("a", "href", process = Process(prefix = "https:"))
), ),
next = Selector( next = Selector(".xpage-pagination .xpage-pagination-next a:contains(下一页)", "href", Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/"))
".xpage-pagination .xpage-pagination-next a:contains(下一页)",
"href",
Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/")
)
), ),
"https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" to Rule( sourceRule,
text = Content(
expression = ".main",
title = Selector("h1.title"),
author = Selector("#source", process = Process(remove = listOf("来源:"))),
dateTime = Selector("#pubtime"),
content = Selector(".main-content"),
extra = mapOf(
"source" to Selector(".main", properties = mapOf("html" to "true"))
)
)
)
) )
) )
@@ -330,7 +196,6 @@ val 广州数据交易所 = Site(
home = "https://www.cantonde.com", home = "https://www.cantonde.com",
icon = "https://www.cantonde.com/favicon.ico", icon = "https://www.cantonde.com/favicon.ico",
description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场,助力数字经济高质量发展工作部署,高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽,是畅通数据要素大循环的关键举措,也是推进数据要素市场化配置改革的重要载体。", description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场,助力数字经济高质量发展工作部署,高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽,是畅通数据要素大循环的关键举措,也是推进数据要素市场化配置改革的重要载体。",
target = Site.Target.SEARCH,
parser = Parser.Type.JSON, parser = Parser.Type.JSON,
search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D", search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
rules = mapOf( rules = mapOf(
@@ -346,7 +211,7 @@ val 广州数据交易所 = Site(
Script( Script(
Script.Type.Javascript, Script.Type.Javascript,
// language=javascript // language=javascript
"return `https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%22\${text}%22%7D`", "return `https://www.cantonde.com/info.html#/infoDetail?id=\${text}`",
) )
) )
) )
@@ -365,26 +230,16 @@ val 广州数据交易所 = Site(
) )
), ),
), ),
// language=regexp sourceRule,
"https://www\\.cantonde\\.com/si/info/detail\\?json=.+" to Rule(
text = Content(
expression = "$.data",
title = Selector("$.TITLE"),
content = Selector("$.CONTENT", process = Process(remove = commonRemove)),
author = Selector(process = Process(default = "广州数据交易所"))
),
properties = mapOf(post),
),
), ),
) )
val 北京市政务服务和数据管理局 = Site( private val 北京市政务服务和数据管理局 = Site(
code = "cee7f242-668b-41fb-adbc-96fb27d4bf35", code = "cee7f242-668b-41fb-adbc-96fb27d4bf35",
name = "北京市政务服务和数据管理局", name = "北京市政务服务和数据管理局",
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
home = "https://zwfwj.beijing.gov.cn", home = "https://zwfwj.beijing.gov.cn",
icon = "https://zwfwj.beijing.gov.cn/favicon.ico", icon = "https://zwfwj.beijing.gov.cn/favicon.ico",
target = Site.Target.SEARCH,
parser = Parser.Type.JSON, parser = Parser.Type.JSON,
search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20", search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20",
rules = mapOf( rules = mapOf(
@@ -420,18 +275,7 @@ val 北京市政务服务和数据管理局 = Site(
) )
) )
), ),
// language=regexp sourceRule,
"https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(
parser = Parser.Type.CSS,
downloader = Downloader.Type.BROWSER,
text = Content(
expression = "#main .details_page",
title = Selector("h1"),
author = Selector(".article-info .ly", process = Process(remove = listOf("来源:"))),
dateTime = Selector(".article-info span:contains(时间)", process = Process(remove = listOf("时间:"))),
content = Selector("#div_zhengwen")
)
)
) )
) )

View File

@@ -2,6 +2,7 @@ package com.lanyuanxiaoyao.digtal.market.controller
import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.digtal.market.sites import com.lanyuanxiaoyao.digtal.market.sites
import com.lanyuanxiaoyao.squirrel.core.common.Site
import jakarta.annotation.Resource import jakarta.annotation.Resource
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.springframework.data.domain.PageRequest import org.springframework.data.domain.PageRequest
@@ -29,8 +30,10 @@ class OverviewController {
val result = articleRepository.findAll(request) val result = articleRepository.findAll(request)
return mapOf( return mapOf(
"items" to result.content.map { "items" to result.content.map {
val iframe = (sites.firstOrNull { site -> site.code == it.code }?.properties?.get("iframe")?: "false").toBoolean() val site = sites.find { site -> site.code == it.code }!!
val iframe = (site.options?.contains(Site.Option.OPEN_WITH_IFRAME) ?: false)
mapOf( mapOf(
"name" to site.name,
"code" to it.code, "code" to it.code,
"title" to it.title, "title" to it.title,
"subtitle" to it.subtitle, "subtitle" to it.subtitle,
@@ -40,6 +43,7 @@ class OverviewController {
"createTime" to it.createTime, "createTime" to it.createTime,
"iframe" to iframe, "iframe" to iframe,
"category" to it.category, "category" to it.category,
"tags" to it.tags,
) )
}, },
"total" to result.totalElements, "total" to result.totalElements,

View File

@@ -1,7 +1,5 @@
package com.lanyuanxiaoyao.digtal.market.controller package com.lanyuanxiaoyao.digtal.market.controller
import cn.hutool.json.JSON
import cn.hutool.json.JSONUtil
import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
import jakarta.annotation.Resource import jakarta.annotation.Resource
@@ -24,7 +22,7 @@ class SiteController {
private lateinit var management: JvmManagement private lateinit var management: JvmManagement
@GetMapping("list") @GetMapping("list")
fun list(): List<Map<String, Any>> { fun list(): List<Map<String, Any?>> {
val countMap = articleRepository.countGroupByCode().associate { it.key to it.count } val countMap = articleRepository.countGroupByCode().associate { it.key to it.count }
return management.exportSites() return management.exportSites()
.map { site -> .map { site ->
@@ -34,7 +32,7 @@ class SiteController {
"icon" to site.icon, "icon" to site.icon,
"url" to site.home, "url" to site.home,
"description" to site.description, "description" to site.description,
"iframe" to site.properties.containsKey("iframe"), "iframe" to (site.properties?.containsKey("iframe") ?: false),
"news" to (countMap[site.code] ?: 0) "news" to (countMap[site.code] ?: 0)
) )
} }

View File

@@ -1,25 +1,19 @@
package com.lanyuanxiaoyao.digtal.market.runner package com.lanyuanxiaoyao.digtal.market.runner
import cn.hutool.core.date.DateUtil
import cn.hutool.core.util.NumberUtil import cn.hutool.core.util.NumberUtil
import cn.hutool.crypto.SecureUtil
import com.lanyuanxiaoyao.digtal.market.Article
import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.digtal.market.keywords import com.lanyuanxiaoyao.digtal.market.keywords
import com.lanyuanxiaoyao.digtal.market.parseArticle
import com.lanyuanxiaoyao.digtal.market.parseArticleLink
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
import com.lanyuanxiaoyao.digtal.market.sites import com.lanyuanxiaoyao.digtal.market.sites
import com.lanyuanxiaoyao.squirrel.core.common.Management import com.lanyuanxiaoyao.squirrel.core.common.Management
import com.lanyuanxiaoyao.squirrel.core.common.PageParseException import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
import dev.failsafe.Failsafe
import dev.failsafe.FailsafeException
import dev.failsafe.RetryPolicy import dev.failsafe.RetryPolicy
import dev.failsafe.function.CheckedSupplier
import jakarta.annotation.Resource import jakarta.annotation.Resource
import java.util.concurrent.atomic.AtomicLong import java.util.concurrent.atomic.AtomicLong
import kotlin.time.Duration.Companion.seconds import kotlin.time.Duration.Companion.seconds
import kotlin.time.toJavaDuration import kotlin.time.toJavaDuration
import kotlinx.coroutines.runBlocking
import org.jsoup.Jsoup
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.springframework.scheduling.annotation.Scheduled import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Service import org.springframework.stereotype.Service
@@ -54,12 +48,18 @@ class NewsRunner : Runner {
.forEach { site -> .forEach { site ->
logger.info("站点: {}", site.name) logger.info("站点: {}", site.name)
keywords.forEach { keyword -> keywords.forEach { keyword ->
val url = site.search.replace("{query}", keyword) val url = site.search?.replace("{query}", keyword)
logger.info("类目: {}, 地址: {}", keyword, url) logger.info("正在搜索: {}, 地址: {}", keyword, url)
val hashList = articleRepository.findAllId() val links =
val links = parseArticleLink(site.code, url, false) try {
parseArticleLink(management, site.code, url, false)
} catch (e: Exception) {
logger.error("解析失败 $url", e)
emptyList()
}
val total = links.size val total = links.size
val current = AtomicLong(0) val current = AtomicLong(0)
val hashList = articleRepository.findAllId()
links links
.filter { .filter {
if (hashList.contains(it.hash)) { if (hashList.contains(it.hash)) {
@@ -68,128 +68,36 @@ class NewsRunner : Runner {
} else true } else true
} }
.forEach { link -> .forEach { link ->
Thread.sleep(500) Thread.sleep(1000)
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url) logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article -> try {
if (!article.text.isNullOrBlank()) { parseArticle(management, link)
val triple = descriptionService.parseDescription(article.text) ?.let { article ->
logger.info("小标题:{}", triple?.first) if (!article.text.isNullOrBlank() || !article.html.isNullOrBlank()) {
logger.info("描述:{}", triple?.second) val content = if (article.text.isNullOrBlank()) article.html else article.text
logger.info("相关度:{}", triple?.third) val triple = descriptionService.parseDescription(content)
article.subtitle = triple?.first logger.info("标题:{}", triple?.title)
article.description = triple?.second logger.info("作者:{}", triple?.author)
article.score = triple?.third logger.info("副标题:{}", triple?.subtitle)
} logger.info("描述:{}", triple?.description)
logger.info("标签:{}", triple?.tags)
article.category = keyword article.title = if (link.title.isNullOrBlank()) triple?.title else link.title
articleRepository.save(article) article.author = if (article.author.isNullOrBlank()) triple?.author else article.author
article.subtitle = triple?.subtitle
article.description = triple?.description
article.tags = triple?.tags
}
article.category = keyword
articleRepository.save(article)
}
} catch (e: Exception) {
logger.error("解析失败 $link", e)
} }
} }
} }
} }
logger.info("本轮采集完成") logger.info("本轮采集完成")
} }
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
logger.info("Title: {} Datetime: {}", title, datetime)
val page = try {
Failsafe
.with(retryPolicy)
.get(CheckedSupplier {
runBlocking { management.parse(code, url) }
})
} catch (e: FailsafeException) {
logger.error("Parse failure", e)
return null
}
(page["text"] as? Map<*, *>)?.let { text ->
var datetimeText = text["datetime"] as? String
val createTime = try {
if (datetimeText.isNullOrBlank()) {
datetimeText = datetime
}
if (datetimeText?.matches(Regex("\\d{10}")) == true) {
DateUtil
.date(datetimeText.toLong())
.toTimestamp()
} else if (datetimeText?.matches(Regex("\\d{7}")) == true) {
DateUtil
.date(datetimeText.toLong() * 1000)
.toTimestamp()
} else {
DateUtil
.parse(
datetimeText,
"yyyy-MM-dd HH:mm:ss",
"yyyy-MM-dd",
"yyyy年MM月dd日 HH:mm",
"yyyy-MM-dd HH:mm",
)
.toTimestamp()
}
} catch (e: Exception) {
logger.error("Parse $datetimeText error", e)
null
}
val source = text["source"]?.let {
val document = Jsoup.parse((it as String))
document
.select("script")
.forEach { node -> node.remove() }
document.forEachNode { node -> node.removeAttr("style") }
document
.body()
.html()
}
return Article(
id = hash,
code = code,
url = url,
title = text["title"] as String?,
author = text["author"] as String?,
category = null,
text = if ((text["content"] as String?) == null) null else text["content"] as String,
html = source,
subtitle = null,
description = null,
score = null,
createTime = createTime,
pushed = false,
)
} ?: return null
}
data class ArticleLink(val url: String, val hash: String, val title: String?, val datetime: String?)
fun parseArticleLink(code: String, url: String, recursive: Boolean = false): List<ArticleLink> {
var next: String? = url
val links = mutableListOf<ArticleLink>()
do {
logger.info("解析目录:{}", next)
val page = try {
Failsafe
.with(retryPolicy)
.get(CheckedSupplier {
runBlocking { management.parse(code, next!!) }
})
} catch (e: FailsafeException) {
logger.error("Parse failure", e)
continue
}
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
for (item in list) {
val title = item["title"] as String?
val datetime = item["datetime"] as String?
val link = item["link"] as String?
if (link.isNullOrBlank()) {
logger.warn("链接为空:{} {}", title, link)
} else {
links.add(ArticleLink(link, SecureUtil.md5(link), title, datetime))
}
}
}
next = page["next"] as String?
} while (recursive && !next.isNullOrBlank())
return links
}
} }

View File

@@ -12,6 +12,14 @@ import kotlin.time.toJavaDuration
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.springframework.stereotype.Service import org.springframework.stereotype.Service
/**
 * Structured result returned by `DescriptionService.parseDescription`.
 *
 * The values are read from the JSON text produced by the chat call, using the paths
 * `title`, `author`, `subtitle`, `description` and `tags`.
 *
 * @property title value of the `title` field.
 * @property author value of the `author` field.
 * @property subtitle value of the `subtitle` field.
 * @property description value of the `description` field.
 * @property tags value of the `tags` field after a separator replacement that emits `,`;
 * presumably a comma-separated tag list (TODO confirm against the prompt).
 */
data class Description(
val title: String,
val author: String,
val subtitle: String,
val description: String,
val tags: String,
)
@Service @Service
class DescriptionService { class DescriptionService {
private val logger = LoggerFactory.getLogger(javaClass) private val logger = LoggerFactory.getLogger(javaClass)
@@ -26,7 +34,7 @@ class DescriptionService {
QianfanChat(), QianfanChat(),
) )
fun parseDescription(content: String?): Triple<String, String, Int>? { fun parseDescription(content: String?): Description? {
return content?.let { return content?.let {
if (it.isNotBlank()) { if (it.isNotBlank()) {
var description: String? var description: String?
@@ -50,11 +58,16 @@ class DescriptionService {
.replace("```json", "") .replace("```json", "")
.replace("```", "") .replace("```", "")
) )
val subtitle = root.getByPath("title", String::class.java) val title = root.getByPath("title", String::class.java)
val author = root.getByPath("author", String::class.java)
val subtitle = root.getByPath("subtitle", String::class.java)
val desc = root.getByPath("description", String::class.java) val desc = root.getByPath("description", String::class.java)
val score = root.getByPath("score", Int::class.java) val tags = root
return@let Triple(subtitle, desc, score) .getByPath("tags", String::class.java)
.replace("", ",")
return@let Description(title, author, subtitle, desc, tags)
} catch (e: Throwable) { } catch (e: Throwable) {
logger.error("json解析失败", e)
continue continue
} }
} }

View File

@@ -5,7 +5,7 @@ function pagination() {
model: 'normal', model: 'normal',
maxButtons: 10, maxButtons: 10,
showPageInput: false, showPageInput: false,
perPageAvailable: [10, 15, 20], perPageAvailable: [10, 15, 20, 50, 100],
activePage: '${page|default:1}', activePage: '${page|default:1}',
total: '${total|default:0}', total: '${total|default:0}',
className: 'text-right', className: 'text-right',
@@ -131,6 +131,11 @@ function overviewTab() {
className: 'text-current', className: 'text-current',
tpl: '${description}', tpl: '${description}',
}, },
{
type: 'tpl',
className: 'text-blue-900 text-sm mt-2',
tpl: '${name}',
},
{ {
type: 'wrapper', type: 'wrapper',
size: 'none', size: 'none',
@@ -152,10 +157,21 @@ function overviewTab() {
type: 'tag', type: 'tag',
label: '${author}', label: '${author}',
displayMode: 'rounded', displayMode: 'rounded',
color: '#ff8888', color: '#bd6464',
}, },
] ]
}, },
{
type: 'each',
className: 'mt-2',
source: "${SPLIT(tags, ',')}",
items: {
type: 'tag',
label: '${item}',
displayMode: 'rounded',
color: '#6b3481',
}
}
] ]
}, },
}, },

View File

@@ -48,7 +48,7 @@
] ]
} }
} }
let debug = true let debug = false
let server = amis.embed( let server = amis.embed(
'#root', '#root',
amisJSON, amisJSON,

View File

@@ -1,6 +1,7 @@
package com.lanyuanxiaoyao.digtal.market package com.lanyuanxiaoyao.digtal.market
import cn.hutool.core.date.DateUtil import cn.hutool.core.date.DateUtil
import cn.hutool.core.io.FileUtil
import com.lanyuanxiaoyao.digtal.market.ai.Chat import com.lanyuanxiaoyao.digtal.market.ai.Chat
import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat
import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat
@@ -144,4 +145,18 @@ class Test {
) )
) )
} }
/**
 * Manual check of the basic downloader: fetches one article page, strips `script`, `style`
 * and `link` elements plus every inline `style` attribute from the body, and dumps the
 * remaining HTML to a local file for inspection. Makes no assertions.
 */
@Test
fun testNativeDownload() {
    val basicDownloader = BasicDownloaderFactory().build(emptyMap())
    val rawHtml = runBlocking {
        basicDownloader.download("http://zfsg.gd.gov.cn/xxfb/dtxw/content/post_4515949.html")
    }
    val body = Jsoup.parse(rawHtml).body()
    // Drop the non-content elements one tag name at a time, in the original order.
    for (tagName in listOf("script", "style", "link")) {
        for (element in body.select(tagName)) {
            element.remove()
        }
    }
    for (element in body.allElements) {
        element.removeAttr("style")
    }
    FileUtil.writeString(
        body.html(),
        "/Users/lanyuanxiaoyao/Project/IdeaProjects/digtal-market/source.txt",
        Charsets.UTF_8,
    )
}
} }

View File

@@ -24,18 +24,19 @@ class TestManagement {
@Test @Test
fun testParse() { fun testParse() {
newsRunner parseArticleLink(
.parseArticleLink( management,
"9a7f1d8f-4f39-4120-adeb-7435339b97bb", "9a7f1d8f-4f39-4120-adeb-7435339b97bb",
"https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22数据要素%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D", "https://www.gov.cn/zhengce/content/202409/content_6977766.htm",
) )
.forEach { link -> .forEach { link ->
val article = newsRunner.parseArticle( val article = parseArticle(
management,
"9a7f1d8f-4f39-4120-adeb-7435339b97bb", "9a7f1d8f-4f39-4120-adeb-7435339b97bb",
link.url, link.url,
"",
link.title, link.title,
link.datetime, link.datetime,
"",
) )
logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text) logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text)
} }
@@ -54,26 +55,26 @@ class TestManagement {
@Test @Test
fun testParseList() { fun testParseList() {
newsRunner parseArticleLink(
.parseArticleLink( management,
"cee7f242-668b-41fb-adbc-96fb27d4bf35", "cee7f242-668b-41fb-adbc-96fb27d4bf35",
"https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20", "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
true, true,
) )
// .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) } // .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
.let { it.forEach { logger.info("{}", it.url) } } .let { it.forEach { logger.info("{}", it.url) } }
} }
@Test @Test
fun testParseArticle() { fun testParseArticle() {
newsRunner parseArticle(
.parseArticle( management,
"00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be", "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
"https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm", "https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
"工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知", "",
"2024-11-01 12:48:26", "工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知",
"", "2024-11-01 12:48:26",
) )
?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) } ?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
} }
} }

View File

@@ -14,8 +14,8 @@ class TestRule {
private lateinit var management: Management private lateinit var management: Management
private val link = private val link =
"https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html" "https://www.gov.cn/zhengce/202410/content_6979047.htm"
private val site = 北京市政务服务和数据管理局 private val site = sites.find { it.code == "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be" }!!
@BeforeTest @BeforeTest
fun before() { fun before() {

View File

@@ -71,3 +71,16 @@ Content-Type: application/json
### Search ### Search
POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20 POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
### Search
POST https://www.cantonde.com/si/common/searchInfo
Content-Type: application/json
{
"NAME": "数据要素",
"IN_CATEGORY": "",
"NOT_IN_CATEGORY": "",
"CATEGORY": "",
"pageNo": 1,
"pageSize": 10
}