feat: 适配squirrel改造
增加AI解析页面、优化解析流程
This commit is contained in:
@@ -24,7 +24,7 @@ class Article(
|
||||
@Column(columnDefinition = "longtext") var html: String?,
|
||||
var subtitle: String?,
|
||||
@Column(columnDefinition = "longtext") var description: String?,
|
||||
var score: Int?,
|
||||
var tags: String?,
|
||||
var createTime: Date?,
|
||||
var pushed: Boolean?,
|
||||
)
|
||||
@@ -41,7 +41,7 @@ interface ArticleRepository : JpaRepository<Article, String>, JpaSpecificationEx
|
||||
@Query("update Article article set article.pushed = :pushed where article.id = :id")
|
||||
fun updatePushedById(@Param("id") id: String, @Param("pushed") pushed: Boolean)
|
||||
|
||||
@Query("select article.id from Article article where article.description is not null and article.subtitle is not null and article.score is not null")
|
||||
@Query("select article.id from Article article where article.description is not null and article.text is not null and article.text <> ''")
|
||||
fun findAllId(): List<String>
|
||||
|
||||
@Query("select new com.lanyuanxiaoyao.digtal.market.CountGroupByString(article.code, count(article.code)) from Article article group by article.code")
|
||||
|
||||
153
src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt
Normal file
153
src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt
Normal file
@@ -0,0 +1,153 @@
|
||||
package com.lanyuanxiaoyao.digtal.market
|
||||
|
||||
import cn.hutool.core.date.DateUtil
|
||||
import cn.hutool.crypto.SecureUtil
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
|
||||
import dev.failsafe.Failsafe
|
||||
import dev.failsafe.FailsafeException
|
||||
import dev.failsafe.RetryPolicy
|
||||
import dev.failsafe.function.CheckedSupplier
|
||||
import kotlin.time.Duration.Companion.seconds
|
||||
import kotlin.time.toJavaDuration
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.jsoup.Jsoup
|
||||
import org.slf4j.LoggerFactory
|
||||
|
||||
/**
 * A link to a single article discovered on a listing page.
 *
 * The property order is significant: callers destructure instances positionally
 * as `(code, url, hash, title, datetime)`.
 *
 * @property code code of the site the link was found on
 * @property url address of the article
 * @property hash identifier derived from [url] (the producer in this file uses its MD5 digest)
 * @property title title shown on the listing page, if any
 * @property datetime raw datetime text shown on the listing page, if any
 */
data class ArticleLink(
    val code: String,
    val url: String,
    val hash: String,
    val title: String?,
    val datetime: String?,
)
|
||||
|
||||
// Logger shared by the top-level helper functions of this file.
private val logger = LoggerFactory.getLogger("Helper")

/**
 * Retry policy shared by the parsing helpers: at most 2 retries, 10 seconds apart,
 * applied to every failure except [PageParseException].
 */
private val retryPolicy = RetryPolicy
    .builder<Any>()
    .withDelay(10.seconds.toJavaDuration())
    .withMaxRetries(2)
    .handleIf { e -> e !is PageParseException }
    .build()
|
||||
|
||||
/**
 * Collects the article links listed on [url], optionally following the page's "next" link.
 *
 * Each page is fetched through [management] under the shared retry policy. Items of the
 * page's `list` entry that have a non-blank `link` become [ArticleLink]s whose hash is the
 * MD5 digest of the link; items without a link are logged and skipped.
 *
 * Parsing stops, returning what has been collected so far, when
 * - a page still fails after the retries (previously the same URL was retried forever
 *   when [recursive] was `true`, because the loop condition saw an unchanged URL), or
 * - the "next" link points at a URL that was already visited.
 *
 * NOTE(review): Failsafe documents that only checked exceptions are wrapped in
 * [FailsafeException]; unchecked failures would propagate to the caller — confirm callers
 * are prepared for that.
 *
 * @param management parser facade used to fetch and parse a page
 * @param code code of the site whose rules should be applied
 * @param url first listing page; `null` or blank yields an empty result
 * @param recursive when `true`, keeps following the page's `next` link until it is blank
 * @return the links found, in page order
 */
@Suppress("UNCHECKED_CAST")
fun parseArticleLink(management: Management, code: String, url: String?, recursive: Boolean = false): List<ArticleLink> {
    // Nothing to fetch: bail out before entering the (slow) retry machinery.
    if (url.isNullOrBlank()) {
        logger.warn("地址为空,跳过解析:{}", code)
        return emptyList()
    }

    val links = mutableListOf<ArticleLink>()
    val visited = mutableSetOf<String>()
    var next: String? = url
    while (true) {
        val current = next?.takeUnless { it.isBlank() } ?: break
        // Guard against pagination that loops back onto itself.
        if (!visited.add(current)) {
            logger.warn("检测到重复的分页地址,停止解析:{}", current)
            break
        }

        val page = try {
            Failsafe
                .with(retryPolicy)
                .get(CheckedSupplier {
                    runBlocking { management.parse(code, current) }
                })
        } catch (e: FailsafeException) {
            logger.error("解析失败:${current}", e)
            // The next URL is unknown after a failure; retrying the same one would never end.
            break
        }

        (page["list"] as? List<Map<String, Any>>)?.forEach { item ->
            val link = item["link"] as? String
            if (link.isNullOrBlank()) {
                logger.warn("链接为空:{} {}", item["title"] as? String, current)
            } else {
                links.add(
                    ArticleLink(
                        code,
                        link,
                        SecureUtil.md5(link),
                        item["title"] as? String,
                        item["datetime"] as? String,
                    )
                )
            }
        }

        if (!recursive) break
        next = page["next"] as? String
    }
    return links
}
|
||||
|
||||
/**
 * Parses the article behind [articleLink] by forwarding its fields to the field-wise
 * `parseArticle` overload.
 *
 * @param management parser facade used to fetch and parse the page
 * @param articleLink link previously collected from a listing page
 * @return the result of the field-wise overload
 */
fun parseArticle(management: Management, articleLink: ArticleLink): Article? =
    parseArticle(
        management,
        articleLink.code,
        articleLink.url,
        articleLink.hash,
        articleLink.title,
        articleLink.datetime,
    )
|
||||
|
||||
// Epoch timestamps: 13 digits are read as milliseconds, 10 digits as seconds.
private val epochMillisPattern = Regex("\\d{13}")
private val epochSecondsPattern = Regex("\\d{10}")

/**
 * Converts raw datetime text into a timestamp.
 *
 * Accepts 13-digit epoch milliseconds, 10-digit epoch seconds, or one of the textual
 * formats listed below. Returns `null` when [raw] is null/blank (without logging, since a
 * missing date is not an error) or when it cannot be parsed (logged as an error).
 */
private fun parseCreateTime(raw: String?) =
    if (raw.isNullOrBlank()) {
        null
    } else {
        try {
            when {
                raw.matches(epochMillisPattern) -> DateUtil.date(raw.toLong()).toTimestamp()
                raw.matches(epochSecondsPattern) -> DateUtil.date(raw.toLong() * 1000).toTimestamp()
                else ->
                    DateUtil
                        .parse(
                            raw,
                            "yyyy-MM-dd HH:mm:ss",
                            "yyyy-MM-dd",
                            "yyyy年MM月dd日 HH:mm",
                            "yyyy-MM-dd HH:mm",
                        )
                        .toTimestamp()
            }
        } catch (e: Exception) {
            logger.error("解析 $raw 失败", e)
            null
        }
    }

/**
 * Strips `<script>` and `<link>` elements and every `style` attribute from [sourceText]
 * and returns the resulting body HTML. Blank input is returned unchanged.
 */
private fun sanitizeSourceHtml(sourceText: String): String {
    if (sourceText.isBlank()) return sourceText
    val document = Jsoup.parse(sourceText)
    document.select("script").forEach { element -> element.remove() }
    document.select("link").forEach { element -> element.remove() }
    document.forEachNode { node -> node.removeAttr("style") }
    return document.body().html()
}

/**
 * Fetches and parses a single article page into an [Article].
 *
 * The page is parsed through [management] under the shared retry policy. From the page's
 * `text` entry this reads `datetime` (falling back to [datetime] when blank), `content`
 * (stored as the article text, empty string when absent) and `source` (stored as
 * sanitized HTML). Title, author, category, subtitle, description and tags are left
 * `null` here, and `pushed` is `false`.
 *
 * @param management parser facade used to fetch and parse the page
 * @param code code of the site whose rules should be applied
 * @param url address of the article
 * @param hash identifier stored as the article id
 * @param title title from the listing page; only logged here
 * @param datetime datetime text from the listing page, used when the page itself has none
 * @return the article, or `null` when parsing fails with a [FailsafeException] or the page
 *   has no `text` map
 */
fun parseArticle(management: Management, code: String, url: String, hash: String, title: String?, datetime: String?): Article? {
    logger.info("标题:{} 时间:{}", title, datetime)
    val page = try {
        Failsafe
            .with(retryPolicy)
            .get(CheckedSupplier {
                runBlocking { management.parse(code, url) }
            })
    } catch (e: FailsafeException) {
        logger.error("解析失败:${url}", e)
        return null
    }

    val text = (page["text"] as? Map<*, *>) ?: return null

    // Prefer the datetime found on the article page; fall back to the listing page's value.
    val datetimeText = (text["datetime"] as? String)?.takeUnless { it.isBlank() } ?: datetime
    val createTime = parseCreateTime(datetimeText)
    val source = sanitizeSourceHtml((text["source"] as? String) ?: "")

    return Article(
        id = hash,
        code = code,
        url = url,
        title = null,
        author = null,
        category = null,
        text = (text["content"] as? String) ?: "",
        html = source,
        subtitle = null,
        description = null,
        tags = null,
        createTime = createTime,
        pushed = false,
    )
}
|
||||
@@ -11,11 +11,11 @@ import com.lanyuanxiaoyao.squirrel.core.common.Rule
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Script
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Selector
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Site
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Site.Option.Companion.OPEN_WITH_IFRAME
|
||||
|
||||
private val html = Pair("html", "true")
|
||||
private val iframe = Pair("iframe", "true")
|
||||
private val post = Pair("post", "true")
|
||||
private val form = Pair("form", "true")
|
||||
private val html = "html" to "true"
|
||||
private val post = "post" to "true"
|
||||
private val form = "form" to "true"
|
||||
|
||||
// language=regexp
|
||||
private val commonRemove = listOf(
|
||||
@@ -23,19 +23,29 @@ private val commonRemove = listOf(
|
||||
"&.+?;"
|
||||
)
|
||||
|
||||
/**
 * Catch-all rule, keyed by a pattern that matches any http(s) URL.
 *
 * It downloads with the browser downloader, parses with CSS selectors, and captures the
 * document root both as the text content and, with the `html` property set, as the
 * `source` extra. The site definitions in this file append it as the last entry of
 * their `rules` map.
 */
private val sourceRule =
    // language=regexp
    "https*://.+" to Rule(
        downloader = Downloader.Type.BROWSER,
        parser = Parser.Type.CSS,
        text = Content(
            expression = "body",
            content = Selector(":root"),
            extra = mapOf(
                "source" to Selector(":root", properties = mapOf(html))
            ),
        )
    )
|
||||
|
||||
private val 广东政务服务和数据管理局 = Site(
|
||||
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
name = "广东政务服务和数据管理局",
|
||||
home = "https://zfsg.gd.gov.cn",
|
||||
parser = Parser.Type.CSS,
|
||||
author = "lanyuanxiaoyao",
|
||||
target = Site.Target.SEARCH,
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
"https://search.gd.gov.cn/api/search/all.*" to Rule(
|
||||
downloader = Downloader.Type.HTTP,
|
||||
parser = Parser.Type.JSON,
|
||||
list = Content(
|
||||
expression = "$.data.news.list",
|
||||
@@ -57,71 +67,7 @@ private val 广东政务服务和数据管理局 = Site(
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".content-container",
|
||||
title = Selector(".content-box .content h1.title"),
|
||||
author = Selector("td.first:contains(发布机构) + td > span"),
|
||||
dateTime = Selector("td.second:contains(成文日期) + td > span"),
|
||||
content = Selector(".content .article-content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".content .article-content", properties = mapOf(html))
|
||||
),
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
|
||||
text = Content(
|
||||
expression = ".Con",
|
||||
title = Selector("h3.zw-title"),
|
||||
author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))),
|
||||
dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))),
|
||||
content = Selector(".zw"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".zw", properties = mapOf(html))
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https://mp\\.weixin\\.qq\\.com/s/.+" to Rule(
|
||||
text = Content(
|
||||
expression = "#page-content",
|
||||
title = Selector("#activity-name"),
|
||||
author = Selector("#js_name"),
|
||||
dateTime = Selector("#publish_time"),
|
||||
content = Selector("#js_content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector("#js_content", properties = mapOf(html))
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule(
|
||||
text = Content(
|
||||
expression = "#article-container",
|
||||
title = Selector("#article-title"),
|
||||
author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))),
|
||||
dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))),
|
||||
content = Selector("#article-content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector("#article-content", properties = mapOf(html))
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule(
|
||||
text = Content(
|
||||
expression = "body",
|
||||
title = Selector(".title-page .txt > span"),
|
||||
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
|
||||
content = Selector(".content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".content", properties = mapOf(html))
|
||||
)
|
||||
)
|
||||
),
|
||||
sourceRule,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -130,11 +76,8 @@ private val 深圳市政务服务和数据管理局 = Site(
|
||||
name = "深圳市政务服务和数据管理局",
|
||||
home = "https://www.sz.gov.cn/szzsj/gkmlpt/index",
|
||||
icon = "https://www.sz.gov.cn/favicon.ico",
|
||||
parser = Parser.Type.CSS,
|
||||
author = "lanyuanxiaoyao",
|
||||
target = Site.Target.TEXT,
|
||||
downloader = Downloader.Type.HTTP,
|
||||
properties = mapOf(iframe),
|
||||
options = listOf(OPEN_WITH_IFRAME),
|
||||
search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
@@ -179,20 +122,7 @@ private val 深圳市政务服务和数据管理局 = Site(
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".content-container",
|
||||
title = Selector(".content-box .content h1.title"),
|
||||
author = Selector("td.first:contains(发布机构) + td > span"),
|
||||
dateTime = Selector("td.second:contains(成文日期) + td > span"),
|
||||
content = Selector(".content .article-content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".content .article-content", properties = mapOf(html))
|
||||
),
|
||||
)
|
||||
)
|
||||
sourceRule,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -201,12 +131,9 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
name = "中华人民共和国中央人民政府",
|
||||
home = "https://www.gov.cn",
|
||||
icon = "https://www.gov.cn/favicon.ico",
|
||||
parser = Parser.Type.CSS,
|
||||
author = "lanyuanxiaoyao",
|
||||
target = Site.Target.TEXT,
|
||||
downloader = Downloader.Type.HTTP,
|
||||
properties = mapOf(iframe),
|
||||
search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
|
||||
options = listOf(OPEN_WITH_IFRAME),
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
|
||||
@@ -216,10 +143,10 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
"Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D",
|
||||
"Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
|
||||
),
|
||||
properties = mapOf("post" to "true"),
|
||||
properties = mapOf(post),
|
||||
list = Content(
|
||||
expression = "$.result.data.middle.list",
|
||||
title = Selector("$.title_no_tag"),
|
||||
title = Selector("$.title_no_tag", process = Process(remove = commonRemove)),
|
||||
dateTime = Selector("$.time"),
|
||||
link = Selector("$.url"),
|
||||
),
|
||||
@@ -236,51 +163,7 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://www\\.gov\\.cn/.+/(zhengceku)/.+/content_\\d+\\.html*" to Rule(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".policyLibraryOverview_content",
|
||||
author = Selector("td:contains(源:) + td"),
|
||||
content = Selector(".pages_content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".content .article",
|
||||
title = Selector("h1#ti"),
|
||||
author = Selector(
|
||||
".pages-date > .font",
|
||||
process = Process(
|
||||
default = "中华人民共和国中央人民政府",
|
||||
remove = listOf("来源:")
|
||||
)
|
||||
),
|
||||
dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")),
|
||||
content = Selector(".pages_content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".main-content",
|
||||
title = Selector(".qa_content_box"),
|
||||
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源:"))),
|
||||
content = Selector(".qa_content_text"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
|
||||
)
|
||||
)
|
||||
),
|
||||
sourceRule,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -289,12 +172,10 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
|
||||
name = "中华人民共和国国家互联网信息办公室",
|
||||
home = "https://www.cac.gov.cn",
|
||||
icon = "https://www.cac.gov.cn/favicon.ico",
|
||||
parser = Parser.Type.CSS,
|
||||
author = "lanyuanxiaoyao",
|
||||
target = Site.Target.TEXT,
|
||||
downloader = Downloader.Type.HTTP,
|
||||
search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=¬pro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
|
||||
list = Content(
|
||||
expression = ".xpage-container .list-item",
|
||||
@@ -302,24 +183,9 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
|
||||
dateTime = Selector(".search_time"),
|
||||
link = Selector("a", "href", process = Process(prefix = "https:"))
|
||||
),
|
||||
next = Selector(
|
||||
".xpage-pagination .xpage-pagination-next a:contains(下一页)",
|
||||
"href",
|
||||
Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/")
|
||||
)
|
||||
next = Selector(".xpage-pagination .xpage-pagination-next a:contains(下一页)", "href", Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/"))
|
||||
),
|
||||
"https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" to Rule(
|
||||
text = Content(
|
||||
expression = ".main",
|
||||
title = Selector("h1.title"),
|
||||
author = Selector("#source", process = Process(remove = listOf("来源:"))),
|
||||
dateTime = Selector("#pubtime"),
|
||||
content = Selector(".main-content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".main", properties = mapOf("html" to "true"))
|
||||
)
|
||||
)
|
||||
)
|
||||
sourceRule,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -330,7 +196,6 @@ val 广州数据交易所 = Site(
|
||||
home = "https://www.cantonde.com",
|
||||
icon = "https://www.cantonde.com/favicon.ico",
|
||||
description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场,助力数字经济高质量发展工作部署,高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽,是畅通数据要素大循环的关键举措,也是推进数据要素市场化配置改革的重要载体。",
|
||||
target = Site.Target.SEARCH,
|
||||
parser = Parser.Type.JSON,
|
||||
search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
|
||||
rules = mapOf(
|
||||
@@ -346,7 +211,7 @@ val 广州数据交易所 = Site(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
"return `https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%22\${text}%22%7D`",
|
||||
"return `https://www.cantonde.com/info.html#/infoDetail?id=\${text}`",
|
||||
)
|
||||
)
|
||||
)
|
||||
@@ -365,26 +230,16 @@ val 广州数据交易所 = Site(
|
||||
)
|
||||
),
|
||||
),
|
||||
// language=regexp
|
||||
"https://www\\.cantonde\\.com/si/info/detail\\?json=.+" to Rule(
|
||||
text = Content(
|
||||
expression = "$.data",
|
||||
title = Selector("$.TITLE"),
|
||||
content = Selector("$.CONTENT", process = Process(remove = commonRemove)),
|
||||
author = Selector(process = Process(default = "广州数据交易所"))
|
||||
),
|
||||
properties = mapOf(post),
|
||||
),
|
||||
sourceRule,
|
||||
),
|
||||
)
|
||||
|
||||
val 北京市政务服务和数据管理局 = Site(
|
||||
private val 北京市政务服务和数据管理局 = Site(
|
||||
code = "cee7f242-668b-41fb-adbc-96fb27d4bf35",
|
||||
name = "北京市政务服务和数据管理局",
|
||||
author = "lanyuanxiaoyao",
|
||||
home = "https://zwfwj.beijing.gov.cn",
|
||||
icon = "https://zwfwj.beijing.gov.cn/favicon.ico",
|
||||
target = Site.Target.SEARCH,
|
||||
parser = Parser.Type.JSON,
|
||||
search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20",
|
||||
rules = mapOf(
|
||||
@@ -420,18 +275,7 @@ val 北京市政务服务和数据管理局 = Site(
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(
|
||||
parser = Parser.Type.CSS,
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = "#main .details_page",
|
||||
title = Selector("h1"),
|
||||
author = Selector(".article-info .ly", process = Process(remove = listOf("来源:"))),
|
||||
dateTime = Selector(".article-info span:contains(时间)", process = Process(remove = listOf("时间:"))),
|
||||
content = Selector("#div_zhengwen")
|
||||
)
|
||||
)
|
||||
sourceRule,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ package com.lanyuanxiaoyao.digtal.market.controller
|
||||
|
||||
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
|
||||
import com.lanyuanxiaoyao.digtal.market.sites
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Site
|
||||
import jakarta.annotation.Resource
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.springframework.data.domain.PageRequest
|
||||
@@ -29,8 +30,10 @@ class OverviewController {
|
||||
val result = articleRepository.findAll(request)
|
||||
return mapOf(
|
||||
"items" to result.content.map {
|
||||
val iframe = (sites.firstOrNull { site -> site.code == it.code }?.properties?.get("iframe")?: "false").toBoolean()
|
||||
val site = sites.find { site -> site.code == it.code }!!
|
||||
val iframe = (site.options?.contains(Site.Option.OPEN_WITH_IFRAME) ?: false)
|
||||
mapOf(
|
||||
"name" to site.name,
|
||||
"code" to it.code,
|
||||
"title" to it.title,
|
||||
"subtitle" to it.subtitle,
|
||||
@@ -40,6 +43,7 @@ class OverviewController {
|
||||
"createTime" to it.createTime,
|
||||
"iframe" to iframe,
|
||||
"category" to it.category,
|
||||
"tags" to it.tags,
|
||||
)
|
||||
},
|
||||
"total" to result.totalElements,
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
package com.lanyuanxiaoyao.digtal.market.controller
|
||||
|
||||
import cn.hutool.json.JSON
|
||||
import cn.hutool.json.JSONUtil
|
||||
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
|
||||
import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
|
||||
import jakarta.annotation.Resource
|
||||
@@ -24,7 +22,7 @@ class SiteController {
|
||||
private lateinit var management: JvmManagement
|
||||
|
||||
@GetMapping("list")
|
||||
fun list(): List<Map<String, Any>> {
|
||||
fun list(): List<Map<String, Any?>> {
|
||||
val countMap = articleRepository.countGroupByCode().associate { it.key to it.count }
|
||||
return management.exportSites()
|
||||
.map { site ->
|
||||
@@ -34,7 +32,7 @@ class SiteController {
|
||||
"icon" to site.icon,
|
||||
"url" to site.home,
|
||||
"description" to site.description,
|
||||
"iframe" to site.properties.containsKey("iframe"),
|
||||
"iframe" to (site.properties?.containsKey("iframe") ?: false),
|
||||
"news" to (countMap[site.code] ?: 0)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,25 +1,19 @@
|
||||
package com.lanyuanxiaoyao.digtal.market.runner
|
||||
|
||||
import cn.hutool.core.date.DateUtil
|
||||
import cn.hutool.core.util.NumberUtil
|
||||
import cn.hutool.crypto.SecureUtil
|
||||
import com.lanyuanxiaoyao.digtal.market.Article
|
||||
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
|
||||
import com.lanyuanxiaoyao.digtal.market.keywords
|
||||
import com.lanyuanxiaoyao.digtal.market.parseArticle
|
||||
import com.lanyuanxiaoyao.digtal.market.parseArticleLink
|
||||
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
|
||||
import com.lanyuanxiaoyao.digtal.market.sites
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
|
||||
import dev.failsafe.Failsafe
|
||||
import dev.failsafe.FailsafeException
|
||||
import dev.failsafe.RetryPolicy
|
||||
import dev.failsafe.function.CheckedSupplier
|
||||
import jakarta.annotation.Resource
|
||||
import java.util.concurrent.atomic.AtomicLong
|
||||
import kotlin.time.Duration.Companion.seconds
|
||||
import kotlin.time.toJavaDuration
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.jsoup.Jsoup
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.springframework.scheduling.annotation.Scheduled
|
||||
import org.springframework.stereotype.Service
|
||||
@@ -54,12 +48,18 @@ class NewsRunner : Runner {
|
||||
.forEach { site ->
|
||||
logger.info("站点: {}", site.name)
|
||||
keywords.forEach { keyword ->
|
||||
val url = site.search.replace("{query}", keyword)
|
||||
logger.info("类目: {}, 地址: {}", keyword, url)
|
||||
val hashList = articleRepository.findAllId()
|
||||
val links = parseArticleLink(site.code, url, false)
|
||||
val url = site.search?.replace("{query}", keyword)
|
||||
logger.info("正在搜索: {}, 地址: {}", keyword, url)
|
||||
val links =
|
||||
try {
|
||||
parseArticleLink(management, site.code, url, false)
|
||||
} catch (e: Exception) {
|
||||
logger.error("解析失败 $url", e)
|
||||
emptyList()
|
||||
}
|
||||
val total = links.size
|
||||
val current = AtomicLong(0)
|
||||
val hashList = articleRepository.findAllId()
|
||||
links
|
||||
.filter {
|
||||
if (hashList.contains(it.hash)) {
|
||||
@@ -68,128 +68,36 @@ class NewsRunner : Runner {
|
||||
} else true
|
||||
}
|
||||
.forEach { link ->
|
||||
Thread.sleep(500)
|
||||
Thread.sleep(1000)
|
||||
logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
|
||||
parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
|
||||
if (!article.text.isNullOrBlank()) {
|
||||
val triple = descriptionService.parseDescription(article.text)
|
||||
logger.info("小标题:{}", triple?.first)
|
||||
logger.info("描述:{}", triple?.second)
|
||||
logger.info("相关度:{}", triple?.third)
|
||||
article.subtitle = triple?.first
|
||||
article.description = triple?.second
|
||||
article.score = triple?.third
|
||||
}
|
||||
try {
|
||||
parseArticle(management, link)
|
||||
?.let { article ->
|
||||
if (!article.text.isNullOrBlank() || !article.html.isNullOrBlank()) {
|
||||
val content = if (article.text.isNullOrBlank()) article.html else article.text
|
||||
val triple = descriptionService.parseDescription(content)
|
||||
logger.info("标题:{}", triple?.title)
|
||||
logger.info("作者:{}", triple?.author)
|
||||
logger.info("副标题:{}", triple?.subtitle)
|
||||
logger.info("描述:{}", triple?.description)
|
||||
logger.info("标签:{}", triple?.tags)
|
||||
|
||||
article.category = keyword
|
||||
articleRepository.save(article)
|
||||
article.title = if (link.title.isNullOrBlank()) triple?.title else link.title
|
||||
article.author = if (article.author.isNullOrBlank()) triple?.author else article.author
|
||||
article.subtitle = triple?.subtitle
|
||||
article.description = triple?.description
|
||||
article.tags = triple?.tags
|
||||
}
|
||||
|
||||
article.category = keyword
|
||||
articleRepository.save(article)
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
logger.error("解析失败 $link", e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("本轮采集完成")
|
||||
}
|
||||
|
||||
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
|
||||
logger.info("Title: {} Datetime: {}", title, datetime)
|
||||
val page = try {
|
||||
Failsafe
|
||||
.with(retryPolicy)
|
||||
.get(CheckedSupplier {
|
||||
runBlocking { management.parse(code, url) }
|
||||
})
|
||||
} catch (e: FailsafeException) {
|
||||
logger.error("Parse failure", e)
|
||||
return null
|
||||
}
|
||||
(page["text"] as? Map<*, *>)?.let { text ->
|
||||
var datetimeText = text["datetime"] as? String
|
||||
val createTime = try {
|
||||
if (datetimeText.isNullOrBlank()) {
|
||||
datetimeText = datetime
|
||||
}
|
||||
if (datetimeText?.matches(Regex("\\d{10}")) == true) {
|
||||
DateUtil
|
||||
.date(datetimeText.toLong())
|
||||
.toTimestamp()
|
||||
} else if (datetimeText?.matches(Regex("\\d{7}")) == true) {
|
||||
DateUtil
|
||||
.date(datetimeText.toLong() * 1000)
|
||||
.toTimestamp()
|
||||
} else {
|
||||
DateUtil
|
||||
.parse(
|
||||
datetimeText,
|
||||
"yyyy-MM-dd HH:mm:ss",
|
||||
"yyyy-MM-dd",
|
||||
"yyyy年MM月dd日 HH:mm",
|
||||
"yyyy-MM-dd HH:mm",
|
||||
)
|
||||
.toTimestamp()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
logger.error("Parse $datetimeText error", e)
|
||||
null
|
||||
}
|
||||
val source = text["source"]?.let {
|
||||
val document = Jsoup.parse((it as String))
|
||||
document
|
||||
.select("script")
|
||||
.forEach { node -> node.remove() }
|
||||
document.forEachNode { node -> node.removeAttr("style") }
|
||||
document
|
||||
.body()
|
||||
.html()
|
||||
}
|
||||
return Article(
|
||||
id = hash,
|
||||
code = code,
|
||||
url = url,
|
||||
title = text["title"] as String?,
|
||||
author = text["author"] as String?,
|
||||
category = null,
|
||||
text = if ((text["content"] as String?) == null) null else text["content"] as String,
|
||||
html = source,
|
||||
subtitle = null,
|
||||
description = null,
|
||||
score = null,
|
||||
createTime = createTime,
|
||||
pushed = false,
|
||||
)
|
||||
} ?: return null
|
||||
}
|
||||
|
||||
data class ArticleLink(val url: String, val hash: String, val title: String?, val datetime: String?)
|
||||
|
||||
fun parseArticleLink(code: String, url: String, recursive: Boolean = false): List<ArticleLink> {
|
||||
var next: String? = url
|
||||
val links = mutableListOf<ArticleLink>()
|
||||
do {
|
||||
logger.info("解析目录:{}", next)
|
||||
val page = try {
|
||||
Failsafe
|
||||
.with(retryPolicy)
|
||||
.get(CheckedSupplier {
|
||||
runBlocking { management.parse(code, next!!) }
|
||||
})
|
||||
} catch (e: FailsafeException) {
|
||||
logger.error("Parse failure", e)
|
||||
continue
|
||||
}
|
||||
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
|
||||
for (item in list) {
|
||||
val title = item["title"] as String?
|
||||
val datetime = item["datetime"] as String?
|
||||
val link = item["link"] as String?
|
||||
if (link.isNullOrBlank()) {
|
||||
logger.warn("链接为空:{} {}", title, link)
|
||||
} else {
|
||||
links.add(ArticleLink(link, SecureUtil.md5(link), title, datetime))
|
||||
}
|
||||
}
|
||||
}
|
||||
next = page["next"] as String?
|
||||
} while (recursive && !next.isNullOrBlank())
|
||||
return links
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,14 @@ import kotlin.time.toJavaDuration
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.springframework.stereotype.Service
|
||||
|
||||
/**
 * Structured summary of an article as extracted from the chat model's JSON answer.
 *
 * @property title value of the answer's `title` field
 * @property author value of the answer's `author` field
 * @property subtitle value of the answer's `subtitle` field
 * @property description value of the answer's `description` field
 * @property tags value of the answer's `tags` field, kept as a single string
 */
data class Description(
    val title: String,
    val author: String,
    val subtitle: String,
    val description: String,
    val tags: String,
)
|
||||
|
||||
@Service
|
||||
class DescriptionService {
|
||||
private val logger = LoggerFactory.getLogger(javaClass)
|
||||
@@ -26,7 +34,7 @@ class DescriptionService {
|
||||
QianfanChat(),
|
||||
)
|
||||
|
||||
fun parseDescription(content: String?): Triple<String, String, Int>? {
|
||||
fun parseDescription(content: String?): Description? {
|
||||
return content?.let {
|
||||
if (it.isNotBlank()) {
|
||||
var description: String?
|
||||
@@ -50,11 +58,16 @@ class DescriptionService {
|
||||
.replace("```json", "")
|
||||
.replace("```", "")
|
||||
)
|
||||
val subtitle = root.getByPath("title", String::class.java)
|
||||
val title = root.getByPath("title", String::class.java)
|
||||
val author = root.getByPath("author", String::class.java)
|
||||
val subtitle = root.getByPath("subtitle", String::class.java)
|
||||
val desc = root.getByPath("description", String::class.java)
|
||||
val score = root.getByPath("score", Int::class.java)
|
||||
return@let Triple(subtitle, desc, score)
|
||||
val tags = root
|
||||
.getByPath("tags", String::class.java)
|
||||
.replace(",", ",")
|
||||
return@let Description(title, author, subtitle, desc, tags)
|
||||
} catch (e: Throwable) {
|
||||
logger.error("json解析失败", e)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ function pagination() {
|
||||
model: 'normal',
|
||||
maxButtons: 10,
|
||||
showPageInput: false,
|
||||
perPageAvailable: [10, 15, 20],
|
||||
perPageAvailable: [10, 15, 20, 50, 100],
|
||||
activePage: '${page|default:1}',
|
||||
total: '${total|default:0}',
|
||||
className: 'text-right',
|
||||
@@ -131,6 +131,11 @@ function overviewTab() {
|
||||
className: 'text-current',
|
||||
tpl: '${description}',
|
||||
},
|
||||
{
|
||||
type: 'tpl',
|
||||
className: 'text-blue-900 text-sm mt-2',
|
||||
tpl: '${name}',
|
||||
},
|
||||
{
|
||||
type: 'wrapper',
|
||||
size: 'none',
|
||||
@@ -152,10 +157,21 @@ function overviewTab() {
|
||||
type: 'tag',
|
||||
label: '${author}',
|
||||
displayMode: 'rounded',
|
||||
color: '#ff8888',
|
||||
color: '#bd6464',
|
||||
},
|
||||
]
|
||||
},
|
||||
{
|
||||
type: 'each',
|
||||
className: 'mt-2',
|
||||
source: "${SPLIT(tags, ',')}",
|
||||
items: {
|
||||
type: 'tag',
|
||||
label: '${item}',
|
||||
displayMode: 'rounded',
|
||||
color: '#6b3481',
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
},
|
||||
|
||||
@@ -48,7 +48,7 @@
|
||||
]
|
||||
}
|
||||
}
|
||||
let debug = true
|
||||
let debug = false
|
||||
let server = amis.embed(
|
||||
'#root',
|
||||
amisJSON,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package com.lanyuanxiaoyao.digtal.market
|
||||
|
||||
import cn.hutool.core.date.DateUtil
|
||||
import cn.hutool.core.io.FileUtil
|
||||
import com.lanyuanxiaoyao.digtal.market.ai.Chat
|
||||
import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat
|
||||
import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat
|
||||
@@ -144,4 +145,18 @@ class Test {
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Manual check: downloads one fixed article URL, removes script/style/link elements and inline
 * `style` attributes from the page body, and writes the remaining HTML to a local file.
 *
 * Contains no assertions; the result is inspected by opening the written file.
 */
@Test
|
||||
fun testNativeDownload() {
|
||||
// Build a downloader from the factory using an empty option map.
val downloader = BasicDownloaderFactory().build(emptyMap())
|
||||
// The download call is made inside runBlocking, so the test thread waits for the page content.
val page = runBlocking {
|
||||
downloader.download("http://zfsg.gd.gov.cn/xxfb/dtxw/content/post_4515949.html")
|
||||
}
|
||||
// Only the <body> of the parsed document is kept.
val root = Jsoup.parse(page).body()
|
||||
// Drop elements that carry no article text.
root.select("script").forEach { it.remove() }
|
||||
root.select("style").forEach { it.remove() }
|
||||
root.select("link").forEach { it.remove() }
|
||||
// Strip inline styling from every remaining element.
root.allElements.forEach { it.removeAttr("style") }
|
||||
// NOTE(review): the output path is an absolute path under one user's home directory,
// so this test only works on that machine — consider a temp file.
FileUtil.writeString(root.html(), "/Users/lanyuanxiaoyao/Project/IdeaProjects/digtal-market/source.txt", Charsets.UTF_8)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,18 +24,19 @@ class TestManagement {
|
||||
|
||||
@Test
|
||||
fun testParse() {
|
||||
newsRunner
|
||||
.parseArticleLink(
|
||||
"9a7f1d8f-4f39-4120-adeb-7435339b97bb",
|
||||
"https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22数据要素%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
|
||||
)
|
||||
parseArticleLink(
|
||||
management,
|
||||
"9a7f1d8f-4f39-4120-adeb-7435339b97bb",
|
||||
"https://www.gov.cn/zhengce/content/202409/content_6977766.htm",
|
||||
)
|
||||
.forEach { link ->
|
||||
val article = newsRunner.parseArticle(
|
||||
val article = parseArticle(
|
||||
management,
|
||||
"9a7f1d8f-4f39-4120-adeb-7435339b97bb",
|
||||
link.url,
|
||||
"",
|
||||
link.title,
|
||||
link.datetime,
|
||||
"",
|
||||
)
|
||||
logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text)
|
||||
}
|
||||
@@ -54,26 +55,26 @@ class TestManagement {
|
||||
|
||||
/**
 * Parses article links from a fixed search URL for site code
 * "cee7f242-668b-41fb-adbc-96fb27d4bf35" and logs the `url` of each returned link.
 *
 * NOTE(review): this span shows two call forms back to back — `newsRunner.parseArticleLink(...)`
 * and a receiver-less `parseArticleLink(management, ...)`. It appears to be a diff hunk whose
 * +/- markers were lost, with the second form replacing the first — confirm against the repository.
 */
@Test
|
||||
fun testParseList() {
|
||||
// Call form 1: method on newsRunner (site code, URL, boolean flag).
newsRunner
|
||||
.parseArticleLink(
|
||||
"cee7f242-668b-41fb-adbc-96fb27d4bf35",
|
||||
"https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
|
||||
true,
|
||||
)
|
||||
// Call form 2: same arguments, with `management` passed explicitly as the first argument.
parseArticleLink(
|
||||
management,
|
||||
"cee7f242-668b-41fb-adbc-96fb27d4bf35",
|
||||
"https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
|
||||
true,
|
||||
)
|
||||
// .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
|
||||
.let { it.forEach { logger.info("{}", it.url) } }
|
||||
}
|
||||
|
||||
/**
 * Parses a single fixed article page for site code "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be"
 * and, when the result is non-null, logs it as pretty-printed JSON.
 *
 * NOTE(review): this span shows two call forms back to back — `newsRunner.parseArticle(...)`
 * with five arguments and a receiver-less `parseArticle(management, ...)` with six, where the
 * empty-string argument sits in a different position. It appears to be a diff hunk whose +/-
 * markers were lost — confirm which form is current and what the empty-string parameter means.
 */
@Test
|
||||
fun testParseArticle() {
|
||||
// Call form 1: method on newsRunner; the empty string is the last argument.
newsRunner
|
||||
.parseArticle(
|
||||
"00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
|
||||
"https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
|
||||
"工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知",
|
||||
"2024-11-01 12:48:26",
|
||||
"",
|
||||
)
|
||||
// Call form 2: `management` first; the empty string now follows the URL.
parseArticle(
|
||||
management,
|
||||
"00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
|
||||
"https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
|
||||
"",
|
||||
"工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知",
|
||||
"2024-11-01 12:48:26",
|
||||
)
|
||||
?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
|
||||
}
|
||||
}
|
||||
@@ -14,8 +14,8 @@ class TestRule {
|
||||
private lateinit var management: Management
|
||||
|
||||
private val link =
|
||||
"https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html"
|
||||
private val site = 北京市政务服务和数据管理局
|
||||
"https://www.gov.cn/zhengce/202410/content_6979047.htm"
|
||||
private val site = sites.find { it.code == "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be" }!!
|
||||
|
||||
@BeforeTest
|
||||
fun before() {
|
||||
|
||||
@@ -70,4 +70,17 @@ Content-Type: application/json
|
||||
}
|
||||
|
||||
### Search
|
||||
POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
|
||||
POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
|
||||
|
||||
### Search
|
||||
POST https://www.cantonde.com/si/common/searchInfo
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"NAME": "数据要素",
|
||||
"IN_CATEGORY": "",
|
||||
"NOT_IN_CATEGORY": "",
|
||||
"CATEGORY": "",
|
||||
"pageNo": 1,
|
||||
"pageSize": 10
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user