feat: 优化站点规则
使用查询接口代替全部采集
This commit is contained in:
@@ -14,6 +14,7 @@ import com.lanyuanxiaoyao.squirrel.core.common.Site
|
||||
|
||||
// Reusable property entries, passed elsewhere in this file as `properties = mapOf(...)`.
// `html` is attached to the Selector that fills the "source" extra field of an article.
// NOTE(review): presumably makes the selector return raw HTML instead of plain text — confirm in squirrel core.
private val html = Pair("html", "true")
|
||||
// `iframe` is attached at Site level (e.g. the Shenzhen and central-government sites).
// NOTE(review): exact effect on downloading is not visible here — confirm in squirrel core.
private val iframe = Pair("iframe", "true")
|
||||
// `post` is attached to the Rule for the search.gd.gov.cn `/api/search/all` endpoint.
// NOTE(review): presumably switches the request to HTTP POST using the `json` query parameter as body — confirm.
private val post = Pair("post", "true")
|
||||
|
||||
private val timeScript = Script(
|
||||
Script.Type.Javascript,
|
||||
@@ -27,35 +28,62 @@ private val titleScript = Script(
|
||||
script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}",
|
||||
)
|
||||
|
||||
// Regex patterns handed to `Process(remove = commonRemove)` on the title selectors of the
// search-result list rules in this file.
// NOTE(review): presumably every match is deleted from the extracted title (search APIs tend to
// return highlighted titles with markup) — confirm against Process in squirrel core.
// language=regexp
|
||||
private val commonRemove = listOf(
|
||||
// Non-greedy match of anything between `<` and the next `>`, e.g. an HTML tag such as <em>.
"<.+?>",
|
||||
// Non-greedy match from `&` to the next `;`, e.g. an HTML entity such as &nbsp;.
"&.+?;"
|
||||
)
|
||||
|
||||
private val 广东政务服务和数据管理局 = Site(
|
||||
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
name = "广东政务服务和数据管理局",
|
||||
home = "https://zfsg.gd.gov.cn",
|
||||
parser = Parser.Type.CSS,
|
||||
author = "lanyuanxiaoyao",
|
||||
target = Site.Target.TEXT,
|
||||
target = Site.Target.SEARCH,
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
tags = mapOf(
|
||||
"数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html",
|
||||
"省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html",
|
||||
"动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html",
|
||||
"媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html",
|
||||
"政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html",
|
||||
"政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html",
|
||||
),
|
||||
search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule(
|
||||
"https://search.gd.gov.cn/api/search/all.*" to Rule(
|
||||
downloader = Downloader.Type.HTTP,
|
||||
parser = Parser.Type.JSON,
|
||||
list = Content(
|
||||
expression = "ul.newList > li",
|
||||
title = Selector(".til > a"),
|
||||
dateTime = Selector(".time"),
|
||||
link = Selector(".til > a", "href"),
|
||||
expression = "$.data.news.list",
|
||||
title = Selector("$.title", process = Process(remove = commonRemove)),
|
||||
author = Selector("$.source"),
|
||||
dateTime = Selector("$.publish_time"),
|
||||
link = Selector("$.url"),
|
||||
),
|
||||
next = Selector(".page > a.next", "href")
|
||||
properties = mapOf(post),
|
||||
next = Selector(
|
||||
"$.data.news.total", process = Process(
|
||||
script = listOf(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
"let url = params['url']\nif ((url && url !== '') && (text && text !== '')) {\n let total = parseInt(text)\n let postData = JSON.parse(params['json'])\n let current = parseInt(postData['page'] ?? '1')\n if (current * 20 >= total) {\n return ''\n }\n postData['page'] = current + 1\n return 'https://search.gd.gov.cn/api/search/all?json=' + encodeURIComponent(JSON.stringify(postData))\n}\nreturn ''"
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule(
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".content-container",
|
||||
title = Selector(".content-box .content h1.title"),
|
||||
author = Selector("td.first:contains(发布机构) + td > span"),
|
||||
dateTime = Selector("td.second:contains(成文日期) + td > span"),
|
||||
content = Selector(".content .article-content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".content .article-content", properties = mapOf(html))
|
||||
),
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
|
||||
text = Content(
|
||||
expression = ".Con",
|
||||
title = Selector("h3.zw-title"),
|
||||
@@ -119,16 +147,23 @@ private val 深圳市政务服务和数据管理局 = Site(
|
||||
target = Site.Target.TEXT,
|
||||
downloader = Downloader.Type.HTTP,
|
||||
properties = mapOf(iframe),
|
||||
tags = mapOf(
|
||||
"工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1",
|
||||
),
|
||||
search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule(
|
||||
"https*://search\\.gd\\.gov\\.cn/jsonp/site/755576\\?callback=getResult&page=\\d+&pagesize=20&text=.+" to Rule(
|
||||
parser = Parser.Type.JSON,
|
||||
preload = Process(
|
||||
script = listOf(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
"let getResult = result => result\nlet results = eval(text)\nreturn JSON.stringify(results)"
|
||||
)
|
||||
)
|
||||
),
|
||||
list = Content(
|
||||
expression = "$.articles",
|
||||
title = Selector("$.title"),
|
||||
expression = "$.results",
|
||||
title = Selector("$.title", process = Process(remove = commonRemove)),
|
||||
dateTime = Selector(
|
||||
"$.first_publish_time",
|
||||
process = Process(
|
||||
@@ -144,13 +179,13 @@ private val 深圳市政务服务和数据管理局 = Site(
|
||||
link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))),
|
||||
),
|
||||
next = Selector(
|
||||
"$.total",
|
||||
"$.count",
|
||||
process = Process(
|
||||
script = listOf(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n"
|
||||
script = "let url = params['url']\nif (url && url !== '') {\n let current = parseInt(params['page'] ?? '1')\n let total = parseInt(text)\n if (current * 20 >= total) {\n return ''\n }\n return url.replace(/page=\\d+/, 'page=' + (current + 1))\n}\nreturn ''"
|
||||
)
|
||||
)
|
||||
)
|
||||
@@ -183,14 +218,7 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
target = Site.Target.TEXT,
|
||||
downloader = Downloader.Type.HTTP,
|
||||
properties = mapOf(iframe),
|
||||
tags = mapOf(
|
||||
"搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D",
|
||||
"搜索-国家数据局" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D",
|
||||
"搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D",
|
||||
"搜索-数据基础设施" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D",
|
||||
"搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D",
|
||||
"搜索-数据安全" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D",
|
||||
),
|
||||
search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
|
||||
rules = mapOf(
|
||||
// language=regexp
|
||||
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
|
||||
@@ -270,13 +298,7 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
|
||||
author = "lanyuanxiaoyao",
|
||||
target = Site.Target.TEXT,
|
||||
downloader = Downloader.Type.HTTP,
|
||||
tags = mapOf(
|
||||
"搜索-数据要素" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
"搜索-数据跨境" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
"搜索-数据基础设施" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
"搜索-数据安全" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
"搜索-数据交易" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
),
|
||||
search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
rules = mapOf(
|
||||
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=¬pro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
|
||||
list = Content(
|
||||
@@ -312,3 +334,12 @@ val sites = setOf(
|
||||
中华人民共和国中央人民政府,
|
||||
中华人民共和国国家互联网信息办公室,
|
||||
)
|
||||
|
||||
/**
 * Search keywords collected for every site.
 *
 * `NewsRunner.run` iterates this set, substitutes each keyword for the `{query}` placeholder in
 * a site's `search` URL template, and stores the keyword as the `category` of each saved article.
 */
val keywords = setOf(
|
||||
"数据要素",
|
||||
"数据安全",
|
||||
"数据跨境",
|
||||
"数据交易",
|
||||
"数据基础设施",
|
||||
"国家数据局",
|
||||
)
|
||||
|
||||
@@ -5,6 +5,7 @@ import cn.hutool.core.util.NumberUtil
|
||||
import cn.hutool.crypto.SecureUtil
|
||||
import com.lanyuanxiaoyao.digtal.market.Article
|
||||
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
|
||||
import com.lanyuanxiaoyao.digtal.market.keywords
|
||||
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
|
||||
import com.lanyuanxiaoyao.digtal.market.sites
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||
@@ -18,6 +19,7 @@ import java.util.concurrent.atomic.AtomicLong
|
||||
import kotlin.time.Duration.Companion.seconds
|
||||
import kotlin.time.toJavaDuration
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.jsoup.Jsoup
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.springframework.scheduling.annotation.Scheduled
|
||||
import org.springframework.stereotype.Service
|
||||
@@ -30,6 +32,7 @@ class NewsRunner : Runner {
|
||||
.builder<Any>()
|
||||
.withDelay(10.seconds.toJavaDuration())
|
||||
.withMaxRetries(2)
|
||||
.handleIf { e -> (e is PageParseException).not() }
|
||||
.build()
|
||||
|
||||
@Resource
|
||||
@@ -45,8 +48,9 @@ class NewsRunner : Runner {
|
||||
override fun run() {
|
||||
sites.forEach { site ->
|
||||
logger.info("站点: {}", site.name)
|
||||
site.tags.forEach { (tag, url) ->
|
||||
logger.info("类目: {}, 地址: {}", tag, url)
|
||||
keywords.forEach { keyword ->
|
||||
val url = site.search.replace("{query}", keyword)
|
||||
logger.info("类目: {}, 地址: {}", keyword, url)
|
||||
val hashList = articleRepository.findAllId()
|
||||
val links = parseArticleLink(site.code, url, false)
|
||||
val total = links.size
|
||||
@@ -72,12 +76,13 @@ class NewsRunner : Runner {
|
||||
article.score = triple?.third
|
||||
}
|
||||
|
||||
article.category = tag
|
||||
article.category = keyword
|
||||
articleRepository.save(article)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("本轮采集完成")
|
||||
}
|
||||
|
||||
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
|
||||
@@ -94,7 +99,7 @@ class NewsRunner : Runner {
|
||||
}
|
||||
})
|
||||
} catch (e: FailsafeException) {
|
||||
e.printStackTrace()
|
||||
logger.error("Parse failure", e)
|
||||
return null
|
||||
}
|
||||
(page["text"] as? Map<*, *>)?.let { text ->
|
||||
@@ -112,6 +117,16 @@ class NewsRunner : Runner {
|
||||
logger.error("Parse ${text["datetime"]} error", e)
|
||||
null
|
||||
}
|
||||
val source = text["source"]?.let {
|
||||
val document = Jsoup.parse((it as String))
|
||||
document
|
||||
.select("script")
|
||||
.forEach { node -> node.remove() }
|
||||
document.forEachNode { node -> node.removeAttr("style") }
|
||||
document
|
||||
.body()
|
||||
.html()
|
||||
}
|
||||
return Article(
|
||||
id = hash,
|
||||
code = code,
|
||||
@@ -120,7 +135,7 @@ class NewsRunner : Runner {
|
||||
author = text["author"] as String?,
|
||||
category = null,
|
||||
text = if ((text["content"] as String?) == null) null else text["content"] as String,
|
||||
html = text["source"] as String?,
|
||||
html = source,
|
||||
subtitle = null,
|
||||
description = null,
|
||||
score = null,
|
||||
@@ -143,7 +158,8 @@ class NewsRunner : Runner {
|
||||
.get(CheckedSupplier {
|
||||
runBlocking { management.parse(code, next!!) }
|
||||
})
|
||||
} catch (e: PageParseException) {
|
||||
} catch (e: FailsafeException) {
|
||||
logger.error("Parse failure", e)
|
||||
continue
|
||||
}
|
||||
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
|
||||
|
||||
@@ -4,6 +4,7 @@ import cn.hutool.json.JSONUtil
|
||||
import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner
|
||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||
import jakarta.annotation.Resource
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.springframework.boot.test.context.SpringBootTest
|
||||
@@ -22,23 +23,53 @@ class TestManagement {
|
||||
@Resource
|
||||
private lateinit var newsRunner: NewsRunner
|
||||
|
||||
/**
 * Manual end-to-end check with no assertions: collects article links from the
 * search.gd.gov.cn search API (keyword "数据要素", site code of 广东政务服务和数据管理局),
 * parses every linked article, and logs its title, create time, author and text.
 *
 * NOTE(review): talks to the live sites, so it presumably needs network access — confirm before
 * running in CI.
 */
@Test
|
||||
fun testParse() {
|
||||
newsRunner
|
||||
.parseArticleLink(
|
||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
"https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||
)
|
||||
.forEach { link ->
|
||||
val article = newsRunner.parseArticle(
|
||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
link.url,
|
||||
|
||||
link.title,
|
||||
|
||||
link.datetime,
|
||||
|
||||
// The `hash` argument is left empty; nothing is persisted by this test.
"",
|
||||
)
|
||||
// parseArticle may return null, hence the safe calls.
logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Manual check with no assertions: downloads a single `gkmlpt` article page of
 * zfsg.gd.gov.cn through `management.download` and logs whatever comes back.
 */
@Test
|
||||
fun testDownload() {
|
||||
// NOTE(review): `download` is presumably a suspend function, hence the runBlocking bridge — confirm.
val page = runBlocking {
|
||||
management.download(
|
||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
"https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589",
|
||||
)
|
||||
}
|
||||
logger.info("Page: {}", page)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testParseList() {
|
||||
newsRunner
|
||||
.parseArticleLink(
|
||||
"1df28c35-1e9e-4d58-9595-f08029b160b4",
|
||||
"https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
"https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||
true,
|
||||
)
|
||||
.forEach { logger.info("{} {} {}", it.datetime, it.url, it.title) }
|
||||
.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testParseArticle() {
|
||||
newsRunner
|
||||
.parseArticle(
|
||||
"1df28c35-1e9e-4d58-9595-f08029b160b4",
|
||||
"https://www.cac.gov.cn/2024-10/14/c_1730595202555062.htm",
|
||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||
"https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589",
|
||||
"no title",
|
||||
"no datetime",
|
||||
"",
|
||||
|
||||
@@ -48,3 +48,23 @@ Content-Type: application/json
|
||||
"你好,你能帮我做什么"
|
||||
]
|
||||
}
|
||||
|
||||
### Search
|
||||
GET https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=2&pagesize=20&text=数据要素&order=1&position=all
|
||||
|
||||
### Search
|
||||
POST https://search.gd.gov.cn/api/search/all
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"page": 1,
|
||||
"keywords": "数据要素",
|
||||
"advance": "true",
|
||||
"sort": "time",
|
||||
"position": "all",
|
||||
"time_to": 2524579200,
|
||||
"time_from": 189273600,
|
||||
"site_id": "246",
|
||||
"range": "site",
|
||||
"recommand": 1
|
||||
}
|
||||
Reference in New Issue
Block a user