1
0

feat: 优化站点规则

使用查询接口代替全部采集
This commit is contained in:
2024-11-04 18:48:10 +08:00
parent 0b08b1bfb6
commit b8c253d522
4 changed files with 148 additions and 50 deletions

View File

@@ -14,6 +14,7 @@ import com.lanyuanxiaoyao.squirrel.core.common.Site
private val html = Pair("html", "true") private val html = Pair("html", "true")
private val iframe = Pair("iframe", "true") private val iframe = Pair("iframe", "true")
private val post = Pair("post", "true")
private val timeScript = Script( private val timeScript = Script(
Script.Type.Javascript, Script.Type.Javascript,
@@ -27,35 +28,62 @@ private val titleScript = Script(
script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}", script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}",
) )
// Regular expressions whose matches are removed from titles returned by the search APIs
// (passed as `Process(remove = commonRemove)` on the title selectors):
//  1. inline markup tags, e.g. <em>...</em>
//  2. HTML character references, e.g. &nbsp; / &#8220; / &#x201C;
// The second pattern only accepts name/number characters between '&' and ';'. The previous
// "&.+?;" matched from any ampersand up to the next semicolon and therefore deleted ordinary
// text such as the "&D 报告;" part of "R&D 报告; 第二版".
// language=regexp
private val commonRemove = listOf(
    "<.+?>",
    "&#?[a-zA-Z0-9]+;"
)
private val 广东政务服务和数据管理局 = Site( private val 广东政务服务和数据管理局 = Site(
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
name = "广东政务服务和数据管理局", name = "广东政务服务和数据管理局",
home = "https://zfsg.gd.gov.cn", home = "https://zfsg.gd.gov.cn",
parser = Parser.Type.CSS, parser = Parser.Type.CSS,
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
target = Site.Target.TEXT, target = Site.Target.SEARCH,
downloader = Downloader.Type.BROWSER, downloader = Downloader.Type.BROWSER,
tags = mapOf( search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
"数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html",
"省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html",
"动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html",
"媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html",
"政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html",
"政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html",
),
rules = mapOf( rules = mapOf(
// language=regexp // language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule( "https://search.gd.gov.cn/api/search/all.*" to Rule(
downloader = Downloader.Type.HTTP,
parser = Parser.Type.JSON,
list = Content( list = Content(
expression = "ul.newList > li", expression = "$.data.news.list",
title = Selector(".til > a"), title = Selector("$.title", process = Process(remove = commonRemove)),
dateTime = Selector(".time"), author = Selector("$.source"),
link = Selector(".til > a", "href"), dateTime = Selector("$.publish_time"),
link = Selector("$.url"),
), ),
next = Selector(".page > a.next", "href") properties = mapOf(post),
next = Selector(
"$.data.news.total", process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
"let url = params['url']\nif ((url && url !== '') && (text && text !== '')) {\n let total = parseInt(text)\n let postData = JSON.parse(params['json'])\n let current = parseInt(postData['page'] ?? '1')\n if (current * 20 >= total) {\n return ''\n }\n postData['page'] = current + 1\n return 'https://search.gd.gov.cn/api/search/all?json=' + encodeURIComponent(JSON.stringify(postData))\n}\nreturn ''"
)
)
)
)
), ),
// language=regexp // language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule( "https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content-container",
title = Selector(".content-box .content h1.title"),
author = Selector("td.first:contains(发布机构) + td > span"),
dateTime = Selector("td.second:contains(成文日期) + td > span"),
content = Selector(".content .article-content"),
extra = mapOf(
"source" to Selector(".content .article-content", properties = mapOf(html))
),
)
),
// language=regexp
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
text = Content( text = Content(
expression = ".Con", expression = ".Con",
title = Selector("h3.zw-title"), title = Selector("h3.zw-title"),
@@ -119,16 +147,23 @@ private val 深圳市政务服务和数据管理局 = Site(
target = Site.Target.TEXT, target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP, downloader = Downloader.Type.HTTP,
properties = mapOf(iframe), properties = mapOf(iframe),
tags = mapOf( search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
"工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1",
),
rules = mapOf( rules = mapOf(
// language=regexp // language=regexp
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule( "https*://search\\.gd\\.gov\\.cn/jsonp/site/755576\\?callback=getResult&page=\\d+&pagesize=20&text=.+" to Rule(
parser = Parser.Type.JSON, parser = Parser.Type.JSON,
preload = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
"let getResult = result => result\nlet results = eval(text)\nreturn JSON.stringify(results)"
)
)
),
list = Content( list = Content(
expression = "$.articles", expression = "$.results",
title = Selector("$.title"), title = Selector("$.title", process = Process(remove = commonRemove)),
dateTime = Selector( dateTime = Selector(
"$.first_publish_time", "$.first_publish_time",
process = Process( process = Process(
@@ -144,13 +179,13 @@ private val 深圳市政务服务和数据管理局 = Site(
link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))), link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))),
), ),
next = Selector( next = Selector(
"$.total", "$.count",
process = Process( process = Process(
script = listOf( script = listOf(
Script( Script(
Script.Type.Javascript, Script.Type.Javascript,
// language=javascript // language=javascript
script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n" script = "let url = params['url']\nif (url && url !== '') {\n let current = parseInt(params['page'] ?? '1')\n let total = parseInt(text)\n if (current * 20 >= total) {\n return ''\n }\n return url.replace(/page=\\d+/, 'page=' + (current + 1))\n}\nreturn ''"
) )
) )
) )
@@ -183,14 +218,7 @@ private val 中华人民共和国中央人民政府 = Site(
target = Site.Target.TEXT, target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP, downloader = Downloader.Type.HTTP,
properties = mapOf(iframe), properties = mapOf(iframe),
tags = mapOf( search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
"搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D",
"搜索-国家数据局" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D",
"搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D",
"搜索-数据基础设施" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D",
"搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D",
"搜索-数据安全" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D",
),
rules = mapOf( rules = mapOf(
// language=regexp // language=regexp
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule( "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
@@ -270,13 +298,7 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
author = "lanyuanxiaoyao", author = "lanyuanxiaoyao",
target = Site.Target.TEXT, target = Site.Target.TEXT,
downloader = Downloader.Type.HTTP, downloader = Downloader.Type.HTTP,
tags = mapOf( search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据要素" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据跨境" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据基础设施" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据安全" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
"搜索-数据交易" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
),
rules = mapOf( rules = mapOf(
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule( "https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
list = Content( list = Content(
@@ -312,3 +334,12 @@ val sites = setOf(
中华人民共和国中央人民政府, 中华人民共和国中央人民政府,
中华人民共和国国家互联网信息办公室, 中华人民共和国国家互联网信息办公室,
) )
/**
 * Search terms used by the news runner.
 *
 * Each term replaces the `{query}` placeholder of a site's search URL and is also stored
 * as the category of every article found through that search. Iteration follows the
 * declaration order below.
 */
val keywords: Set<String> = linkedSetOf(
    "数据要素",
    "数据安全",
    "数据跨境",
    "数据交易",
    "数据基础设施",
    "国家数据局",
)

View File

@@ -5,6 +5,7 @@ import cn.hutool.core.util.NumberUtil
import cn.hutool.crypto.SecureUtil import cn.hutool.crypto.SecureUtil
import com.lanyuanxiaoyao.digtal.market.Article import com.lanyuanxiaoyao.digtal.market.Article
import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.digtal.market.ArticleRepository
import com.lanyuanxiaoyao.digtal.market.keywords
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
import com.lanyuanxiaoyao.digtal.market.sites import com.lanyuanxiaoyao.digtal.market.sites
import com.lanyuanxiaoyao.squirrel.core.common.Management import com.lanyuanxiaoyao.squirrel.core.common.Management
@@ -18,6 +19,7 @@ import java.util.concurrent.atomic.AtomicLong
import kotlin.time.Duration.Companion.seconds import kotlin.time.Duration.Companion.seconds
import kotlin.time.toJavaDuration import kotlin.time.toJavaDuration
import kotlinx.coroutines.runBlocking import kotlinx.coroutines.runBlocking
import org.jsoup.Jsoup
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.springframework.scheduling.annotation.Scheduled import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Service import org.springframework.stereotype.Service
@@ -30,6 +32,7 @@ class NewsRunner : Runner {
.builder<Any>() .builder<Any>()
.withDelay(10.seconds.toJavaDuration()) .withDelay(10.seconds.toJavaDuration())
.withMaxRetries(2) .withMaxRetries(2)
.handleIf { e -> (e is PageParseException).not() }
.build() .build()
@Resource @Resource
@@ -45,8 +48,9 @@ class NewsRunner : Runner {
override fun run() { override fun run() {
sites.forEach { site -> sites.forEach { site ->
logger.info("站点: {}", site.name) logger.info("站点: {}", site.name)
site.tags.forEach { (tag, url) -> keywords.forEach { keyword ->
logger.info("类目: {}, 地址: {}", tag, url) val url = site.search.replace("{query}", keyword)
logger.info("类目: {}, 地址: {}", keyword, url)
val hashList = articleRepository.findAllId() val hashList = articleRepository.findAllId()
val links = parseArticleLink(site.code, url, false) val links = parseArticleLink(site.code, url, false)
val total = links.size val total = links.size
@@ -72,12 +76,13 @@ class NewsRunner : Runner {
article.score = triple?.third article.score = triple?.third
} }
article.category = tag article.category = keyword
articleRepository.save(article) articleRepository.save(article)
} }
} }
} }
} }
logger.info("本轮采集完成")
} }
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? { fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
@@ -94,7 +99,7 @@ class NewsRunner : Runner {
} }
}) })
} catch (e: FailsafeException) { } catch (e: FailsafeException) {
e.printStackTrace() logger.error("Parse failure", e)
return null return null
} }
(page["text"] as? Map<*, *>)?.let { text -> (page["text"] as? Map<*, *>)?.let { text ->
@@ -112,6 +117,16 @@ class NewsRunner : Runner {
logger.error("Parse ${text["datetime"]} error", e) logger.error("Parse ${text["datetime"]} error", e)
null null
} }
val source = text["source"]?.let {
val document = Jsoup.parse((it as String))
document
.select("script")
.forEach { node -> node.remove() }
document.forEachNode { node -> node.removeAttr("style") }
document
.body()
.html()
}
return Article( return Article(
id = hash, id = hash,
code = code, code = code,
@@ -120,7 +135,7 @@ class NewsRunner : Runner {
author = text["author"] as String?, author = text["author"] as String?,
category = null, category = null,
text = if ((text["content"] as String?) == null) null else text["content"] as String, text = if ((text["content"] as String?) == null) null else text["content"] as String,
html = text["source"] as String?, html = source,
subtitle = null, subtitle = null,
description = null, description = null,
score = null, score = null,
@@ -143,7 +158,8 @@ class NewsRunner : Runner {
.get(CheckedSupplier { .get(CheckedSupplier {
runBlocking { management.parse(code, next!!) } runBlocking { management.parse(code, next!!) }
}) })
} catch (e: PageParseException) { } catch (e: FailsafeException) {
logger.error("Parse failure", e)
continue continue
} }
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list -> @Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->

View File

@@ -4,6 +4,7 @@ import cn.hutool.json.JSONUtil
import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner
import com.lanyuanxiaoyao.squirrel.core.common.Management import com.lanyuanxiaoyao.squirrel.core.common.Management
import jakarta.annotation.Resource import jakarta.annotation.Resource
import kotlinx.coroutines.runBlocking
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.springframework.boot.test.context.SpringBootTest import org.springframework.boot.test.context.SpringBootTest
@@ -22,23 +23,53 @@ class TestManagement {
@Resource @Resource
private lateinit var newsRunner: NewsRunner private lateinit var newsRunner: NewsRunner
/**
 * Manual end-to-end check: collects the article links from one search URL and then parses
 * every linked article with the same site code, logging title, create time, author and text.
 */
@Test
fun testParse() {
    val siteCode = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e"
    val searchUrl =
        "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D"
    for (link in newsRunner.parseArticleLink(siteCode, searchUrl)) {
        // The hash is irrelevant for this check, so an empty string is passed.
        val parsed = newsRunner.parseArticle(siteCode, link.url, link.title, link.datetime, hash = "")
        logger.info("{} {} {} {}", parsed?.title, parsed?.createTime, parsed?.author, parsed?.text)
    }
}
/**
 * Manual check: downloads a single article page through [Management.download] and logs
 * whatever is returned.
 */
@Test
fun testDownload() {
    val siteCode = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e"
    val articleUrl = "https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589"
    val downloaded = runBlocking { management.download(siteCode, articleUrl) }
    logger.info("Page: {}", downloaded)
}
@Test @Test
fun testParseList() { fun testParseList() {
newsRunner newsRunner
.parseArticleLink( .parseArticleLink(
"1df28c35-1e9e-4d58-9595-f08029b160b4", "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
"https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
true, true,
) )
.forEach { logger.info("{} {} {}", it.datetime, it.url, it.title) } .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
} }
@Test @Test
fun testParseArticle() { fun testParseArticle() {
newsRunner newsRunner
.parseArticle( .parseArticle(
"1df28c35-1e9e-4d58-9595-f08029b160b4", "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
"https://www.cac.gov.cn/2024-10/14/c_1730595202555062.htm", "https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589",
"no title", "no title",
"no datetime", "no datetime",
"", "",

View File

@@ -48,3 +48,23 @@ Content-Type: application/json
"你好,你能帮我做什么" "你好,你能帮我做什么"
] ]
} }
### Search
# JSONP site search for site 755576 (callback=getResult): requests page 2 with 20 results per page.
GET https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=2&pagesize=20&text=数据要素&order=1&position=all
### Search
# Search API called with a JSON body: first page for site_id 246, sorted by time.
POST https://search.gd.gov.cn/api/search/all
Content-Type: application/json
{
"page": 1,
"keywords": "数据要素",
"advance": "true",
"sort": "time",
"position": "all",
"time_to": 2524579200,
"time_from": 189273600,
"site_id": "246",
"range": "site",
"recommand": 1
}