From 5829b6e1456cfaf61c21b0a53f49d7b16ba4bcf5 Mon Sep 17 00:00:00 2001 From: lanyuanxiaoyao Date: Mon, 4 Nov 2024 09:23:38 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0spring=E7=8E=AF?= =?UTF-8?q?=E5=A2=83=E7=9A=84=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle.kts | 5 +- .../digtal/market/Application.kt | 15 +- .../com/lanyuanxiaoyao/digtal/market/Sites.kt | 490 ++++++++++-------- .../digtal/market/runner/NewsRunner.kt | 4 +- .../digtal/market/TestManagement.kt | 48 ++ 5 files changed, 329 insertions(+), 233 deletions(-) create mode 100644 src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt diff --git a/build.gradle.kts b/build.gradle.kts index 83e8e59..c0734dc 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -34,7 +34,9 @@ repositories { } dependencies { - implementation("com.lanyuanxiaoyao:squirrel-core-jvm:1.0.0-SNAPSHOT") + implementation("com.lanyuanxiaoyao:squirrel-core-jvm:1.0.0-SNAPSHOT") { + exclude("commons-logging", "commons-logging") + } implementation("dev.failsafe:failsafe:3.3.2") implementation("cn.bigmodel.openapi:oapi-java-sdk:release-V4-2.3.0") implementation("com.baidubce:qianfan:0.1.1") @@ -61,6 +63,7 @@ dependencies { implementation("org.jetbrains.kotlin:kotlin-reflect") // implementation("org.jetbrains.kotlin:kotlin-scripting-jsr223") + testImplementation("org.springframework.boot:spring-boot-starter-test") testImplementation("org.jetbrains.kotlin:kotlin-test") } diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Application.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Application.kt index fa26267..1406b43 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Application.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Application.kt @@ -3,8 +3,6 @@ package com.lanyuanxiaoyao.digtal.market import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner import 
com.lanyuanxiaoyao.digtal.market.runner.PushRunner import com.lanyuanxiaoyao.squirrel.core.common.Management -import com.lanyuanxiaoyao.squirrel.core.jvm.BINARY_PATH -import com.lanyuanxiaoyao.squirrel.core.jvm.DRIVER_PATH import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement import com.lanyuanxiaoyao.squirrel.core.jvm.LocalFileDatabase import jakarta.annotation.Resource @@ -20,7 +18,6 @@ import org.springframework.context.ApplicationListener import org.springframework.context.annotation.Bean import org.springframework.context.annotation.Configuration import org.springframework.context.event.ContextClosedEvent -import org.springframework.scheduling.annotation.EnableScheduling import org.springframework.web.servlet.config.annotation.CorsRegistry import org.springframework.web.servlet.config.annotation.WebMvcConfigurer @@ -41,7 +38,7 @@ data class MailProperties @ConstructorBinding constructor( val targets: List, ) -@EnableScheduling +// @EnableScheduling @OptIn(ExperimentalStdlibApi::class) @ConfigurationPropertiesScan @SpringBootApplication @@ -73,7 +70,7 @@ class Configuration { @Bean fun jvmManagement( driverProperties: DriverProperties, - databaseProperties: DatabaseProperties + databaseProperties: DatabaseProperties, ): Management = JvmManagement( database = LocalFileDatabase(databaseProperties.jsonPath) ).also { management -> @@ -84,13 +81,7 @@ class Configuration { management.changeDownloader("basicCacheDownloader") } if (information.browserDownloaderName.isBlank()) { - management.changeDownloader( - "chromeCacheDownloader", - mapOf( - BINARY_PATH to driverProperties.binaryPath, - DRIVER_PATH to driverProperties.driverPath, - ) - ) + management.changeDownloader("htmlUnitCacheDownloader") } } management.importSites(sites) diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt index 9ce0e36..2c53567 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt +++ 
b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt @@ -1,3 +1,5 @@ +@file:Suppress("NonAsciiCharacters", "ObjectPropertyName") + package com.lanyuanxiaoyao.digtal.market import com.lanyuanxiaoyao.squirrel.core.common.Content @@ -25,236 +27,288 @@ private val titleScript = Script( script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}", ) -val sites = setOf( - Site( - code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", - name = "广东政务服务和数据管理局", - home = "https://zfsg.gd.gov.cn", - parser = Parser.Type.CSS, - author = "lanyuanxiaoyao", - target = Site.Target.TEXT, - downloader = Downloader.Type.BROWSER, - tags = mapOf( - "数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html", - "省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html", - "动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html", - "媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html", - "政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html", - "政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html", - ), - rules = mapOf( - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule( - list = Content( - expression = "ul.newList > li", - title = Selector(".til > a"), - dateTime = Selector(".time"), - link = Selector(".til > a", "href"), - ), - next = Selector(".page > a.next", "href") - ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule( - text = Content( - expression = ".Con", - title = Selector("h3.zw-title"), - author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))), - dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))), - content = Selector(".zw"), - extra = mapOf( - "source" to Selector(".zw", properties = mapOf(html)) - ) - ) - ), - // language=regexp - "https://mp\\.weixin\\.qq\\.com/s/.+" to Rule( - text = Content( - expression = "#page-content", - title = 
Selector("#activity-name"), - author = Selector("#js_name"), - dateTime = Selector("#publish_time"), - content = Selector("#js_content"), - extra = mapOf( - "source" to Selector("#js_content", properties = mapOf(html)) - ) - ) - ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule( - text = Content( - expression = "#article-container", - title = Selector("#article-title"), - author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))), - dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))), - content = Selector("#article-content"), - extra = mapOf( - "source" to Selector("#article-content", properties = mapOf(html)) - ) - ) - ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule( - text = Content( - expression = "body", - title = Selector(".title-page .txt > span"), - author = Selector(process = Process(default = "广东政务服务和数据管理局")), - dateTime = Selector(process = Process(script = listOf(timeScript))), - content = Selector(".content"), - extra = mapOf( - "source" to Selector(".content", properties = mapOf(html)) - ) - ) - ), - ) +private val 广东政务服务和数据管理局 = Site( + code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", + name = "广东政务服务和数据管理局", + home = "https://zfsg.gd.gov.cn", + parser = Parser.Type.CSS, + author = "lanyuanxiaoyao", + target = Site.Target.TEXT, + downloader = Downloader.Type.BROWSER, + tags = mapOf( + "数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html", + "省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html", + "动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html", + "媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html", + "政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html", + "政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html", ), - Site( - code = "189a0e12-9319-4a4b-bf3b-34a6282e6f68", - name = "深圳市政务服务和数据管理局", - home = "https://www.sz.gov.cn/szzsj/gkmlpt/index", - icon = 
"https://www.sz.gov.cn/favicon.ico", - parser = Parser.Type.CSS, - author = "lanyuanxiaoyao", - target = Site.Target.TEXT, - downloader = Downloader.Type.HTTP, - properties = mapOf(iframe), - tags = mapOf( - "工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1", + rules = mapOf( + // language=regexp + "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule( + list = Content( + expression = "ul.newList > li", + title = Selector(".til > a"), + dateTime = Selector(".time"), + link = Selector(".til > a", "href"), + ), + next = Selector(".page > a.next", "href") ), - rules = mapOf( - // language=regexp - "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule( - parser = Parser.Type.JSON, - list = Content( - expression = "$.articles", - title = Selector("$.title"), - dateTime = Selector( - "$.first_publish_time", - process = Process( - script = listOf( - Script( - Script.Type.Javascript, - // language=javascript - script = "let datetime = new Date(text * 1000)\nreturn `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`" - ) - ) - ) - ), - link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))), - ), - next = Selector( - "$.total", + // language=regexp + "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule( + text = Content( + expression = ".Con", + title = Selector("h3.zw-title"), + author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))), + dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))), + content = Selector(".zw"), + extra = mapOf( + "source" to Selector(".zw", properties = mapOf(html)) + ) + ) + ), + // language=regexp + "https://mp\\.weixin\\.qq\\.com/s/.+" to Rule( + text = Content( + expression = "#page-content", + title = Selector("#activity-name"), + author = 
Selector("#js_name"), + dateTime = Selector("#publish_time"), + content = Selector("#js_content"), + extra = mapOf( + "source" to Selector("#js_content", properties = mapOf(html)) + ) + ) + ), + // language=regexp + "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule( + text = Content( + expression = "#article-container", + title = Selector("#article-title"), + author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))), + dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))), + content = Selector("#article-content"), + extra = mapOf( + "source" to Selector("#article-content", properties = mapOf(html)) + ) + ) + ), + // language=regexp + "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule( + text = Content( + expression = "body", + title = Selector(".title-page .txt > span"), + author = Selector(process = Process(default = "广东政务服务和数据管理局")), + dateTime = Selector(process = Process(script = listOf(timeScript))), + content = Selector(".content"), + extra = mapOf( + "source" to Selector(".content", properties = mapOf(html)) + ) + ) + ), + ) +) + +private val 深圳市政务服务和数据管理局 = Site( + code = "189a0e12-9319-4a4b-bf3b-34a6282e6f68", + name = "深圳市政务服务和数据管理局", + home = "https://www.sz.gov.cn/szzsj/gkmlpt/index", + icon = "https://www.sz.gov.cn/favicon.ico", + parser = Parser.Type.CSS, + author = "lanyuanxiaoyao", + target = Site.Target.TEXT, + downloader = Downloader.Type.HTTP, + properties = mapOf(iframe), + tags = mapOf( + "工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1", + ), + rules = mapOf( + // language=regexp + "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule( + parser = Parser.Type.JSON, + list = Content( + expression = "$.articles", + title = Selector("$.title"), + dateTime = Selector( + "$.first_publish_time", process = Process( script = listOf( Script( Script.Type.Javascript, // language=javascript - script = "let total = 
parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n" + script = "let datetime = new Date(text * 1000)\nreturn `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`" ) ) ) - ) + ), + link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))), ), - // language=regexp - "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".content-container", - title = Selector(".content-box .content h1.title"), - author = Selector("td.first:contains(发布机构) + td > span"), - dateTime = Selector("td.second:contains(成文日期) + td > span"), - content = Selector(".content .article-content"), - extra = mapOf( - "source" to Selector(".content .article-content", properties = mapOf(html)) - ), + next = Selector( + "$.total", + process = Process( + script = listOf( + Script( + Script.Type.Javascript, + // language=javascript + script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? 
'1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n" + ) + ) + ) + ) + ), + // language=regexp + "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule( + downloader = Downloader.Type.BROWSER, + text = Content( + expression = ".content-container", + title = Selector(".content-box .content h1.title"), + author = Selector("td.first:contains(发布机构) + td > span"), + dateTime = Selector("td.second:contains(成文日期) + td > span"), + content = Selector(".content .article-content"), + extra = mapOf( + "source" to Selector(".content .article-content", properties = mapOf(html)) + ), + ) + ) + ) +) + +private val 中华人民共和国中央人民政府 = Site( + code = "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be", + name = "中华人民共和国中央人民政府", + home = "https://www.gov.cn", + icon = "https://www.gov.cn/favicon.ico", + parser = Parser.Type.CSS, + author = "lanyuanxiaoyao", + target = Site.Target.TEXT, + downloader = Downloader.Type.HTTP, + properties = mapOf(iframe), + tags = mapOf( + "搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D", + "搜索-国家数据局" to 
"https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D", + "搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D", + "搜索-数据基础设施" to 
"https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D", + "搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D", + "搜索-数据安全" to 
"https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D", + ), + rules = mapOf( + // language=regexp + "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule( + parser = Parser.Type.JSON, + headers = mapOf( + "Content-Type" to "application/json", + "Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D", + "Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2", + ), + properties = mapOf("post" to "true"), + list = Content( + expression = "$.result.data.middle.list", + title = Selector("$.title_no_tag"), + dateTime = Selector("$.time"), + link = Selector("$.url"), + ), + next = Selector( + expression = "$.result.data.pager", + process = Process( + script = listOf( + Script( + Script.Type.Javascript, + // language=javascript + script = "let query = {\n code: '17da70961a7',\n historySearchWords: [],\n dataTypeId: '107',\n orderBy: 'time',\n searchBy: 'title',\n appendixType: '',\n granularity: 'ALL',\n trackTotalHits: true,\n beginDateTime: '',\n endDateTime: '',\n isSearchForced: 0,\n filters: [],\n pageNo: 1,\n pageSize: 10,\n customFilter: {\n operator: 'and',\n properties: []\n },\n searchWord: '数据要素'\n}\nif (text && text !== '') {\n let 
pager = JSON.parse(text)\n let current = pager['pageNo']\n let count = pager['pageCount']\n if (current < count) {\n query.pageNo = current + 1\n } else {\n return ''\n }\n}\nreturn `https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=\${encodeURIComponent(JSON.stringify(query))}`\n" + ) + ) + ) + ) + ), + // language=regexp + "https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule( + downloader = Downloader.Type.BROWSER, + text = Content( + expression = ".content .article", + title = Selector("h1#ti", process = Process(script = listOf(titleScript))), + author = Selector( + ".pages-date > .font", + process = Process( + default = "中华人民共和国中央人民政府", + remove = listOf("来源:") + ) + ), + dateTime = Selector( + ".pages-date", + process = Process(script = listOf(timeScript)), + properties = mapOf("precision" to "true") + ), + content = Selector(".pages_content"), + extra = mapOf( + "source" to Selector(".pages_content", properties = mapOf("html" to "true")) + ) + ) + ), + // language=regexp + "https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule( + downloader = Downloader.Type.BROWSER, + text = Content( + expression = ".main-content", + title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))), + author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源:"))), + dateTime = Selector(process = Process(script = listOf(timeScript))), + content = Selector(".qa_content_text"), + extra = mapOf( + "source" to Selector(".main-content", properties = mapOf("html" to "true")) + ) + ) + ), + ) +) + +private val 中华人民共和国国家互联网信息办公室 = Site( + code = "1df28c35-1e9e-4d58-9595-f08029b160b4", + name = "中华人民共和国国家互联网信息办公室", + home = "https://www.cac.gov.cn", + icon = "https://www.cac.gov.cn/favicon.ico", + parser = Parser.Type.CSS, + author = "lanyuanxiaoyao", + target = Site.Target.TEXT, + downloader = Downloader.Type.HTTP, + tags = mapOf( + "搜索-数据要素" to 
"https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + "搜索-数据跨境" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + "搜索-数据基础设施" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + "搜索-数据安全" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + "搜索-数据交易" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + ), + rules = mapOf( + "https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule( + list = Content( + expression = ".xpage-container .list-item", + title = Selector("a", process = Process(remove = listOf("\\s*»\\s*"))), + dateTime = Selector(".search_time"), + link = Selector("a", "href", process = Process(prefix = "https:")) + ), + next = Selector( + ".xpage-pagination .xpage-pagination-next a:contains(下一页)", + "href", + Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/") + ) + ), + "https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" 
to Rule( + text = Content( + expression = ".main", + title = Selector("h1.title"), + author = Selector("#source", process = Process(remove = listOf("来源:"))), + dateTime = Selector("#pubtime"), + content = Selector(".main-content"), + extra = mapOf( + "source" to Selector(".main", properties = mapOf("html" to "true")) ) ) ) - ), - Site( - code = "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be", - name = "中华人民共和国中央人民政府", - home = "https://www.gov.cn", - icon = "https://www.gov.cn/favicon.ico", - parser = Parser.Type.CSS, - author = "lanyuanxiaoyao", - target = Site.Target.TEXT, - downloader = Downloader.Type.HTTP, - properties = mapOf(iframe), - tags = mapOf( - "搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D", - "搜索-国家数据局" to 
"https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D", - "搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D", - "搜索-数据基础设施" to 
"https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D", - "搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D", - "搜索-数据安全" to 
"https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D", - ), - rules = mapOf( - // language=regexp - "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule( - parser = Parser.Type.JSON, - headers = mapOf( - "Content-Type" to "application/json", - "Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D", - "Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2", - ), - properties = mapOf("post" to "true"), - list = Content( - expression = "$.result.data.middle.list", - title = Selector("$.title_no_tag"), - dateTime = Selector("$.time"), - link = Selector("$.url"), - ), - next = Selector( - expression = "$.result.data.pager", - process = Process( - script = listOf( - Script( - Script.Type.Javascript, - // language=javascript - script = "let query = {\n code: '17da70961a7',\n historySearchWords: [],\n dataTypeId: '107',\n orderBy: 'time',\n searchBy: 'title',\n appendixType: '',\n granularity: 'ALL',\n trackTotalHits: true,\n beginDateTime: '',\n endDateTime: '',\n isSearchForced: 0,\n filters: [],\n pageNo: 1,\n pageSize: 10,\n customFilter: {\n operator: 'and',\n properties: []\n },\n searchWord: '数据要素'\n}\nif (text && text !== '') {\n let 
pager = JSON.parse(text)\n let current = pager['pageNo']\n let count = pager['pageCount']\n if (current < count) {\n query.pageNo = current + 1\n } else {\n return ''\n }\n}\nreturn `https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=\${encodeURIComponent(JSON.stringify(query))}`\n" - ) - ) - ) - ) - ), - // language=regexp - "https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".content .article", - title = Selector("h1#ti", process = Process(script = listOf(titleScript))), - author = Selector( - ".pages-date > .font", - process = Process( - default = "中华人民共和国中央人民政府", - remove = listOf("来源:") - ) - ), - dateTime = Selector( - ".pages-date", - process = Process(script = listOf(timeScript)), - properties = mapOf("precision" to "true") - ), - content = Selector(".pages_content"), - extra = mapOf( - "source" to Selector(".pages_content", properties = mapOf("html" to "true")) - ) - ) - ), - // language=regexp - "https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".main-content", - title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))), - author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源:"))), - dateTime = Selector(process = Process(script = listOf(timeScript))), - content = Selector(".qa_content_text"), - extra = mapOf( - "source" to Selector(".main-content", properties = mapOf("html" to "true")) - ) - ) - ), - ) - ), + ) +) + +val sites = setOf( + 广东政务服务和数据管理局, + 深圳市政务服务和数据管理局, + 中华人民共和国中央人民政府, + 中华人民共和国国家互联网信息办公室, ) diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt index fcd14de..4f28747 100644 --- 
a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt @@ -125,7 +125,7 @@ class NewsRunner : Runner { description = null, score = null, createTime = createTime, - pushed = null, + pushed = false, ) } ?: return null } @@ -159,7 +159,7 @@ class NewsRunner : Runner { } } next = page["next"] as String? - } while (recursive and !next.isNullOrBlank()) + } while (recursive && !next.isNullOrBlank()) return links } } diff --git a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt new file mode 100644 index 0000000..6ecd8fb --- /dev/null +++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt @@ -0,0 +1,48 @@ +package com.lanyuanxiaoyao.digtal.market + +import cn.hutool.json.JSONUtil +import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner +import com.lanyuanxiaoyao.squirrel.core.common.Management +import jakarta.annotation.Resource +import org.junit.jupiter.api.Test +import org.slf4j.LoggerFactory +import org.springframework.boot.test.context.SpringBootTest + +@SpringBootTest( + classes = [Application::class], + webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT, +) +@ExperimentalStdlibApi +class TestManagement { + private val logger = LoggerFactory.getLogger(javaClass) + + @Resource + private lateinit var management: Management + + @Resource + private lateinit var newsRunner: NewsRunner + + @Test + fun testParseList() { + newsRunner + .parseArticleLink( + "1df28c35-1e9e-4d58-9595-f08029b160b4", + "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + true, + ) + .forEach { logger.info("{} {} {}", it.datetime, it.url, it.title) } + } + + @Test + fun testParseArticle() { + newsRunner + 
.parseArticle( + "1df28c35-1e9e-4d58-9595-f08029b160b4", + "https://www.cac.gov.cn/2024-10/14/c_1730595202555062.htm", + "no title", + "no datetime", + "", + ) + ?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) } + } +} \ No newline at end of file