1
0

feat: 增加spring环境的测试

This commit is contained in:
2024-11-04 09:23:38 +08:00
parent a89f4ab8b1
commit 5829b6e145
5 changed files with 329 additions and 233 deletions

View File

@@ -34,7 +34,9 @@ repositories {
}
dependencies {
implementation("com.lanyuanxiaoyao:squirrel-core-jvm:1.0.0-SNAPSHOT")
implementation("com.lanyuanxiaoyao:squirrel-core-jvm:1.0.0-SNAPSHOT") {
exclude("commons-logging", "commons-logging")
}
implementation("dev.failsafe:failsafe:3.3.2")
implementation("cn.bigmodel.openapi:oapi-java-sdk:release-V4-2.3.0")
implementation("com.baidubce:qianfan:0.1.1")
@@ -61,6 +63,7 @@ dependencies {
implementation("org.jetbrains.kotlin:kotlin-reflect")
// implementation("org.jetbrains.kotlin:kotlin-scripting-jsr223")
testImplementation("org.springframework.boot:spring-boot-starter-test")
testImplementation("org.jetbrains.kotlin:kotlin-test")
}

View File

@@ -3,8 +3,6 @@ package com.lanyuanxiaoyao.digtal.market
import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner
import com.lanyuanxiaoyao.digtal.market.runner.PushRunner
import com.lanyuanxiaoyao.squirrel.core.common.Management
import com.lanyuanxiaoyao.squirrel.core.jvm.BINARY_PATH
import com.lanyuanxiaoyao.squirrel.core.jvm.DRIVER_PATH
import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
import com.lanyuanxiaoyao.squirrel.core.jvm.LocalFileDatabase
import jakarta.annotation.Resource
@@ -20,7 +18,6 @@ import org.springframework.context.ApplicationListener
import org.springframework.context.annotation.Bean
import org.springframework.context.annotation.Configuration
import org.springframework.context.event.ContextClosedEvent
import org.springframework.scheduling.annotation.EnableScheduling
import org.springframework.web.servlet.config.annotation.CorsRegistry
import org.springframework.web.servlet.config.annotation.WebMvcConfigurer
@@ -41,7 +38,7 @@ data class MailProperties @ConstructorBinding constructor(
val targets: List<String>,
)
@EnableScheduling
// @EnableScheduling
@OptIn(ExperimentalStdlibApi::class)
@ConfigurationPropertiesScan
@SpringBootApplication
@@ -73,7 +70,7 @@ class Configuration {
@Bean
fun jvmManagement(
driverProperties: DriverProperties,
databaseProperties: DatabaseProperties
databaseProperties: DatabaseProperties,
): Management = JvmManagement(
database = LocalFileDatabase(databaseProperties.jsonPath)
).also { management ->
@@ -84,13 +81,7 @@ class Configuration {
management.changeDownloader("basicCacheDownloader")
}
if (information.browserDownloaderName.isBlank()) {
management.changeDownloader(
"chromeCacheDownloader",
mapOf(
BINARY_PATH to driverProperties.binaryPath,
DRIVER_PATH to driverProperties.driverPath,
)
)
management.changeDownloader("htmlUnitCacheDownloader")
}
}
management.importSites(sites)

View File

@@ -1,3 +1,5 @@
@file:Suppress("NonAsciiCharacters", "ObjectPropertyName")
package com.lanyuanxiaoyao.digtal.market
import com.lanyuanxiaoyao.squirrel.core.common.Content
@@ -25,236 +27,288 @@ private val titleScript = Script(
script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}",
)
val sites = setOf(
Site(
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
name = "广东政务服务和数据管理局",
home = "https://zfsg.gd.gov.cn",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.BROWSER,
tags = mapOf(
"数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html",
"省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html",
"动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html",
"媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html",
"政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html",
"政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html",
),
rules = mapOf(
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule(
list = Content(
expression = "ul.newList > li",
title = Selector(".til > a"),
dateTime = Selector(".time"),
link = Selector(".til > a", "href"),
),
next = Selector(".page > a.next", "href")
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule(
text = Content(
expression = ".Con",
title = Selector("h3.zw-title"),
author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))),
dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))),
content = Selector(".zw"),
extra = mapOf(
"source" to Selector(".zw", properties = mapOf(html))
)
)
),
// language=regexp
"https://mp\\.weixin\\.qq\\.com/s/.+" to Rule(
text = Content(
expression = "#page-content",
title = Selector("#activity-name"),
author = Selector("#js_name"),
dateTime = Selector("#publish_time"),
content = Selector("#js_content"),
extra = mapOf(
"source" to Selector("#js_content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule(
text = Content(
expression = "#article-container",
title = Selector("#article-title"),
author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))),
dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))),
content = Selector("#article-content"),
extra = mapOf(
"source" to Selector("#article-content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule(
text = Content(
expression = "body",
title = Selector(".title-page .txt > span"),
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
dateTime = Selector(process = Process(script = listOf(timeScript))),
content = Selector(".content"),
extra = mapOf(
"source" to Selector(".content", properties = mapOf(html))
)
)
),
)
private val 广东政务服务和数据管理局 = Site(
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
name = "广东政务服务和数据管理局",
home = "https://zfsg.gd.gov.cn",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.BROWSER,
tags = mapOf(
"数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html",
"省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html",
"动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html",
"媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html",
"政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html",
"政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html",
),
Site(
code = "189a0e12-9319-4a4b-bf3b-34a6282e6f68",
name = "深圳市政务服务和数据管理局",
home = "https://www.sz.gov.cn/szzsj/gkmlpt/index",
icon = "https://www.sz.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
tags = mapOf(
"工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1",
rules = mapOf(
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule(
list = Content(
expression = "ul.newList > li",
title = Selector(".til > a"),
dateTime = Selector(".time"),
link = Selector(".til > a", "href"),
),
next = Selector(".page > a.next", "href")
),
rules = mapOf(
// language=regexp
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule(
parser = Parser.Type.JSON,
list = Content(
expression = "$.articles",
title = Selector("$.title"),
dateTime = Selector(
"$.first_publish_time",
process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
script = "let datetime = new Date(text * 1000)\nreturn `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`"
)
)
)
),
link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))),
),
next = Selector(
"$.total",
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule(
text = Content(
expression = ".Con",
title = Selector("h3.zw-title"),
author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))),
dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))),
content = Selector(".zw"),
extra = mapOf(
"source" to Selector(".zw", properties = mapOf(html))
)
)
),
// language=regexp
"https://mp\\.weixin\\.qq\\.com/s/.+" to Rule(
text = Content(
expression = "#page-content",
title = Selector("#activity-name"),
author = Selector("#js_name"),
dateTime = Selector("#publish_time"),
content = Selector("#js_content"),
extra = mapOf(
"source" to Selector("#js_content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule(
text = Content(
expression = "#article-container",
title = Selector("#article-title"),
author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))),
dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))),
content = Selector("#article-content"),
extra = mapOf(
"source" to Selector("#article-content", properties = mapOf(html))
)
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule(
text = Content(
expression = "body",
title = Selector(".title-page .txt > span"),
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
dateTime = Selector(process = Process(script = listOf(timeScript))),
content = Selector(".content"),
extra = mapOf(
"source" to Selector(".content", properties = mapOf(html))
)
)
),
)
)
private val 深圳市政务服务和数据管理局 = Site(
code = "189a0e12-9319-4a4b-bf3b-34a6282e6f68",
name = "深圳市政务服务和数据管理局",
home = "https://www.sz.gov.cn/szzsj/gkmlpt/index",
icon = "https://www.sz.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
tags = mapOf(
"工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1",
),
rules = mapOf(
// language=regexp
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule(
parser = Parser.Type.JSON,
list = Content(
expression = "$.articles",
title = Selector("$.title"),
dateTime = Selector(
"$.first_publish_time",
process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n"
script = "let datetime = new Date(text * 1000)\nreturn `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`"
)
)
)
)
),
link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))),
),
// language=regexp
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
next = Selector(
"$.total",
process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n"
)
)
)
)
),
// language=regexp
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
)
)
)
)
private val 中华人民共和国中央人民政府 = Site(
code = "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
name = "中华人民共和国中央人民政府",
home = "https://www.gov.cn",
icon = "https://www.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
tags = mapOf(
"搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D",
"搜索-国家数据局" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D",
"搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D",
"搜索-数据基础设施" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D",
"搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D",
"搜索-数据安全" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D",
),
rules = mapOf(
// language=regexp
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
parser = Parser.Type.JSON,
headers = mapOf(
"Content-Type" to "application/json",
"Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D",
"Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
),
properties = mapOf("post" to "true"),
list = Content(
expression = "$.result.data.middle.list",
title = Selector("$.title_no_tag"),
dateTime = Selector("$.time"),
link = Selector("$.url"),
),
next = Selector(
expression = "$.result.data.pager",
process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
script = "let query = {\n code: '17da70961a7',\n historySearchWords: [],\n dataTypeId: '107',\n orderBy: 'time',\n searchBy: 'title',\n appendixType: '',\n granularity: 'ALL',\n trackTotalHits: true,\n beginDateTime: '',\n endDateTime: '',\n isSearchForced: 0,\n filters: [],\n pageNo: 1,\n pageSize: 10,\n customFilter: {\n operator: 'and',\n properties: []\n },\n searchWord: '数据要素'\n}\nif (text && text !== '') {\n let pager = JSON.parse(text)\n let current = pager['pageNo']\n let count = pager['pageCount']\n if (current < count) {\n query.pageNo = current + 1\n } else {\n return ''\n }\n}\nreturn `https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=\${encodeURIComponent(JSON.stringify(query))}`\n"
)
)
)
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content .article",
title = Selector("h1#ti", process = Process(script = listOf(titleScript))),
author = Selector(
".pages-date > .font",
process = Process(
default = "中华人民共和国中央人民政府",
remove = listOf("来源:")
)
),
dateTime = Selector(
".pages-date",
process = Process(script = listOf(timeScript)),
properties = mapOf("precision" to "true")
),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".main-content",
title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))),
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("\\s*源:"))),
dateTime = Selector(process = Process(script = listOf(timeScript))),
content = Selector(".qa_content_text"),
extra = mapOf(
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
)
)
),
)
)
private val 中华人民共和国国家互联网信息办公室 = Site(
code = "1df28c35-1e9e-4d58-9595-f08029b160b4",
name = "中华人民共和国国家互联网信息办公室",
home = "https://www.cac.gov.cn",
icon = "https://www.cac.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
tags = mapOf(
"搜索-数据要素" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据跨境" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据基础设施" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据安全" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据交易" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
),
rules = mapOf(
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
list = Content(
expression = ".xpage-container .list-item",
title = Selector("a", process = Process(remove = listOf("\\s*»\\s*"))),
dateTime = Selector(".search_time"),
link = Selector("a", "href", process = Process(prefix = "https:"))
),
next = Selector(
".xpage-pagination .xpage-pagination-next a:contains(下一页)",
"href",
Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/")
)
),
"https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" to Rule(
text = Content(
expression = ".main",
title = Selector("h1.title"),
author = Selector("#source", process = Process(remove = listOf("来源:"))),
dateTime = Selector("#pubtime"),
content = Selector(".main-content"),
extra = mapOf(
"source" to Selector(".main", properties = mapOf("html" to "true"))
)
)
)
),
Site(
code = "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
name = "中华人民共和国中央人民政府",
home = "https://www.gov.cn",
icon = "https://www.gov.cn/favicon.ico",
parser = Parser.Type.CSS,
author = "lanyuanxiaoyao",
target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP,
properties = mapOf(iframe),
tags = mapOf(
"搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D",
"搜索-国家数据局" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D",
"搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D",
"搜索-数据基础设施" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D",
"搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D",
"搜索-数据安全" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D",
),
rules = mapOf(
// language=regexp
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
parser = Parser.Type.JSON,
headers = mapOf(
"Content-Type" to "application/json",
"Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D",
"Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
),
properties = mapOf("post" to "true"),
list = Content(
expression = "$.result.data.middle.list",
title = Selector("$.title_no_tag"),
dateTime = Selector("$.time"),
link = Selector("$.url"),
),
next = Selector(
expression = "$.result.data.pager",
process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
script = "let query = {\n code: '17da70961a7',\n historySearchWords: [],\n dataTypeId: '107',\n orderBy: 'time',\n searchBy: 'title',\n appendixType: '',\n granularity: 'ALL',\n trackTotalHits: true,\n beginDateTime: '',\n endDateTime: '',\n isSearchForced: 0,\n filters: [],\n pageNo: 1,\n pageSize: 10,\n customFilter: {\n operator: 'and',\n properties: []\n },\n searchWord: '数据要素'\n}\nif (text && text !== '') {\n let pager = JSON.parse(text)\n let current = pager['pageNo']\n let count = pager['pageCount']\n if (current < count) {\n query.pageNo = current + 1\n } else {\n return ''\n }\n}\nreturn `https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=\${encodeURIComponent(JSON.stringify(query))}`\n"
)
)
)
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content .article",
title = Selector("h1#ti", process = Process(script = listOf(titleScript))),
author = Selector(
".pages-date > .font",
process = Process(
default = "中华人民共和国中央人民政府",
remove = listOf("来源:")
)
),
dateTime = Selector(
".pages-date",
process = Process(script = listOf(timeScript)),
properties = mapOf("precision" to "true")
),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".main-content",
title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))),
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("\\s*源:"))),
dateTime = Selector(process = Process(script = listOf(timeScript))),
content = Selector(".qa_content_text"),
extra = mapOf(
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
)
)
),
)
),
)
)
/**
 * Every crawl-site definition declared in this file, gathered into a single set.
 *
 * The individual [Site] values are private; this set is the only non-private
 * handle to them.
 */
val sites = setOf(
广东政务服务和数据管理局,
深圳市政务服务和数据管理局,
中华人民共和国中央人民政府,
中华人民共和国国家互联网信息办公室,
)

View File

@@ -125,7 +125,7 @@ class NewsRunner : Runner {
description = null,
score = null,
createTime = createTime,
pushed = null,
pushed = false,
)
} ?: return null
}
@@ -159,7 +159,7 @@ class NewsRunner : Runner {
}
}
next = page["next"] as String?
} while (recursive and !next.isNullOrBlank())
} while (recursive && !next.isNullOrBlank())
return links
}
}

View File

@@ -0,0 +1,48 @@
package com.lanyuanxiaoyao.digtal.market
import cn.hutool.json.JSONUtil
import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner
import com.lanyuanxiaoyao.squirrel.core.common.Management
import jakarta.annotation.Resource
import org.junit.jupiter.api.Test
import org.slf4j.LoggerFactory
import org.springframework.boot.test.context.SpringBootTest
/**
 * Spring-context smoke tests for the crawling pipeline.
 *
 * The full [Application] context is started on a random port so that the real
 * [Management] and [NewsRunner] beans are injected. Both tests call live URLs
 * and only log what comes back; they contain no assertions, so they fail only
 * when the call itself throws.
 */
@SpringBootTest(
    webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT,
    classes = [Application::class],
)
@ExperimentalStdlibApi
class TestManagement {
    private val logger = LoggerFactory.getLogger(javaClass)

    // Injected but not referenced by any test in this class.
    @Resource
    private lateinit var management: Management

    @Resource
    private lateinit var newsRunner: NewsRunner

    /**
     * Parses a search-result list page for site code
     * `1df28c35-1e9e-4d58-9595-f08029b160b4` and logs the datetime, url and
     * title of every returned link.
     */
    @Test
    fun testParseList() {
        // NOTE(review): the trailing `true` is presumably the "recursive" (follow next page) flag — confirm against NewsRunner.
        val articleLinks = newsRunner.parseArticleLink(
            "1df28c35-1e9e-4d58-9595-f08029b160b4",
            "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
            true,
        )
        articleLinks.forEach { link ->
            logger.info("{} {} {}", link.datetime, link.url, link.title)
        }
    }

    /**
     * Parses a single article page for the same site code and, when a result
     * is returned, logs it as pretty-printed JSON. A `null` result is silently
     * ignored.
     */
    @Test
    fun testParseArticle() {
        val article = newsRunner.parseArticle(
            "1df28c35-1e9e-4d58-9595-f08029b160b4",
            "https://www.cac.gov.cn/2024-10/14/c_1730595202555062.htm",
            "no title",
            "no datetime",
            "",
        )
        if (article != null) {
            logger.info("{}", JSONUtil.toJsonPrettyStr(article))
        }
    }
}