feat: 优化站点规则
使用查询接口代替全部采集
This commit is contained in:
@@ -14,6 +14,7 @@ import com.lanyuanxiaoyao.squirrel.core.common.Site
|
|||||||
|
|
||||||
private val html = Pair("html", "true")
|
private val html = Pair("html", "true")
|
||||||
private val iframe = Pair("iframe", "true")
|
private val iframe = Pair("iframe", "true")
|
||||||
|
private val post = Pair("post", "true")
|
||||||
|
|
||||||
private val timeScript = Script(
|
private val timeScript = Script(
|
||||||
Script.Type.Javascript,
|
Script.Type.Javascript,
|
||||||
@@ -27,35 +28,62 @@ private val titleScript = Script(
|
|||||||
script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}",
|
script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// language=regexp
|
||||||
|
private val commonRemove = listOf(
|
||||||
|
"<.+?>",
|
||||||
|
"&.+?;"
|
||||||
|
)
|
||||||
|
|
||||||
private val 广东政务服务和数据管理局 = Site(
|
private val 广东政务服务和数据管理局 = Site(
|
||||||
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
name = "广东政务服务和数据管理局",
|
name = "广东政务服务和数据管理局",
|
||||||
home = "https://zfsg.gd.gov.cn",
|
home = "https://zfsg.gd.gov.cn",
|
||||||
parser = Parser.Type.CSS,
|
parser = Parser.Type.CSS,
|
||||||
author = "lanyuanxiaoyao",
|
author = "lanyuanxiaoyao",
|
||||||
target = Site.Target.TEXT,
|
target = Site.Target.SEARCH,
|
||||||
downloader = Downloader.Type.BROWSER,
|
downloader = Downloader.Type.BROWSER,
|
||||||
tags = mapOf(
|
search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||||
"数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html",
|
|
||||||
"省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html",
|
|
||||||
"动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html",
|
|
||||||
"媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html",
|
|
||||||
"政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html",
|
|
||||||
"政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html",
|
|
||||||
),
|
|
||||||
rules = mapOf(
|
rules = mapOf(
|
||||||
// language=regexp
|
// language=regexp
|
||||||
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule(
|
"https://search.gd.gov.cn/api/search/all.*" to Rule(
|
||||||
|
downloader = Downloader.Type.HTTP,
|
||||||
|
parser = Parser.Type.JSON,
|
||||||
list = Content(
|
list = Content(
|
||||||
expression = "ul.newList > li",
|
expression = "$.data.news.list",
|
||||||
title = Selector(".til > a"),
|
title = Selector("$.title", process = Process(remove = commonRemove)),
|
||||||
dateTime = Selector(".time"),
|
author = Selector("$.source"),
|
||||||
link = Selector(".til > a", "href"),
|
dateTime = Selector("$.publish_time"),
|
||||||
|
link = Selector("$.url"),
|
||||||
),
|
),
|
||||||
next = Selector(".page > a.next", "href")
|
properties = mapOf(post),
|
||||||
|
next = Selector(
|
||||||
|
"$.data.news.total", process = Process(
|
||||||
|
script = listOf(
|
||||||
|
Script(
|
||||||
|
Script.Type.Javascript,
|
||||||
|
// language=javascript
|
||||||
|
"let url = params['url']\nif ((url && url !== '') && (text && text !== '')) {\n let total = parseInt(text)\n let postData = JSON.parse(params['json'])\n let current = parseInt(postData['page'] ?? '1')\n if (current * 20 >= total) {\n return ''\n }\n postData['page'] = current + 1\n return 'https://search.gd.gov.cn/api/search/all?json=' + encodeURIComponent(JSON.stringify(postData))\n}\nreturn ''"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
),
|
),
|
||||||
// language=regexp
|
// language=regexp
|
||||||
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule(
|
"https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
|
||||||
|
downloader = Downloader.Type.BROWSER,
|
||||||
|
text = Content(
|
||||||
|
expression = ".content-container",
|
||||||
|
title = Selector(".content-box .content h1.title"),
|
||||||
|
author = Selector("td.first:contains(发布机构) + td > span"),
|
||||||
|
dateTime = Selector("td.second:contains(成文日期) + td > span"),
|
||||||
|
content = Selector(".content .article-content"),
|
||||||
|
extra = mapOf(
|
||||||
|
"source" to Selector(".content .article-content", properties = mapOf(html))
|
||||||
|
),
|
||||||
|
)
|
||||||
|
),
|
||||||
|
// language=regexp
|
||||||
|
"https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
|
||||||
text = Content(
|
text = Content(
|
||||||
expression = ".Con",
|
expression = ".Con",
|
||||||
title = Selector("h3.zw-title"),
|
title = Selector("h3.zw-title"),
|
||||||
@@ -119,16 +147,23 @@ private val 深圳市政务服务和数据管理局 = Site(
|
|||||||
target = Site.Target.TEXT,
|
target = Site.Target.TEXT,
|
||||||
downloader = Downloader.Type.HTTP,
|
downloader = Downloader.Type.HTTP,
|
||||||
properties = mapOf(iframe),
|
properties = mapOf(iframe),
|
||||||
tags = mapOf(
|
search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
|
||||||
"工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1",
|
|
||||||
),
|
|
||||||
rules = mapOf(
|
rules = mapOf(
|
||||||
// language=regexp
|
// language=regexp
|
||||||
"https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule(
|
"https*://search\\.gd\\.gov\\.cn/jsonp/site/755576\\?callback=getResult&page=\\d+&pagesize=20&text=.+" to Rule(
|
||||||
parser = Parser.Type.JSON,
|
parser = Parser.Type.JSON,
|
||||||
|
preload = Process(
|
||||||
|
script = listOf(
|
||||||
|
Script(
|
||||||
|
Script.Type.Javascript,
|
||||||
|
// language=javascript
|
||||||
|
"let getResult = result => result\nlet results = eval(text)\nreturn JSON.stringify(results)"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
),
|
||||||
list = Content(
|
list = Content(
|
||||||
expression = "$.articles",
|
expression = "$.results",
|
||||||
title = Selector("$.title"),
|
title = Selector("$.title", process = Process(remove = commonRemove)),
|
||||||
dateTime = Selector(
|
dateTime = Selector(
|
||||||
"$.first_publish_time",
|
"$.first_publish_time",
|
||||||
process = Process(
|
process = Process(
|
||||||
@@ -144,13 +179,13 @@ private val 深圳市政务服务和数据管理局 = Site(
|
|||||||
link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))),
|
link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))),
|
||||||
),
|
),
|
||||||
next = Selector(
|
next = Selector(
|
||||||
"$.total",
|
"$.count",
|
||||||
process = Process(
|
process = Process(
|
||||||
script = listOf(
|
script = listOf(
|
||||||
Script(
|
Script(
|
||||||
Script.Type.Javascript,
|
Script.Type.Javascript,
|
||||||
// language=javascript
|
// language=javascript
|
||||||
script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n"
|
script = "let url = params['url']\nif (url && url !== '') {\n let current = parseInt(params['page'] ?? '1')\n let total = parseInt(text)\n if (current * 20 >= total) {\n return ''\n }\n return url.replace(/page=\\d+/, 'page=' + (current + 1))\n}\nreturn ''"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -183,14 +218,7 @@ private val 中华人民共和国中央人民政府 = Site(
|
|||||||
target = Site.Target.TEXT,
|
target = Site.Target.TEXT,
|
||||||
downloader = Downloader.Type.HTTP,
|
downloader = Downloader.Type.HTTP,
|
||||||
properties = mapOf(iframe),
|
properties = mapOf(iframe),
|
||||||
tags = mapOf(
|
search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
|
||||||
"搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D",
|
|
||||||
"搜索-国家数据局" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D",
|
|
||||||
"搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D",
|
|
||||||
"搜索-数据基础设施" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D",
|
|
||||||
"搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D",
|
|
||||||
"搜索-数据安全" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D",
|
|
||||||
),
|
|
||||||
rules = mapOf(
|
rules = mapOf(
|
||||||
// language=regexp
|
// language=regexp
|
||||||
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
|
"https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
|
||||||
@@ -270,13 +298,7 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
|
|||||||
author = "lanyuanxiaoyao",
|
author = "lanyuanxiaoyao",
|
||||||
target = Site.Target.TEXT,
|
target = Site.Target.TEXT,
|
||||||
downloader = Downloader.Type.HTTP,
|
downloader = Downloader.Type.HTTP,
|
||||||
tags = mapOf(
|
search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
||||||
"搜索-数据要素" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
|
||||||
"搜索-数据跨境" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
|
||||||
"搜索-数据基础设施" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
|
||||||
"搜索-数据安全" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
|
||||||
"搜索-数据交易" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
|
||||||
),
|
|
||||||
rules = mapOf(
|
rules = mapOf(
|
||||||
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
|
"https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
|
||||||
list = Content(
|
list = Content(
|
||||||
@@ -312,3 +334,12 @@ val sites = setOf(
|
|||||||
中华人民共和国中央人民政府,
|
中华人民共和国中央人民政府,
|
||||||
中华人民共和国国家互联网信息办公室,
|
中华人民共和国国家互联网信息办公室,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
val keywords = setOf(
|
||||||
|
"数据要素",
|
||||||
|
"数据安全",
|
||||||
|
"数据跨境",
|
||||||
|
"数据交易",
|
||||||
|
"数据基础设施",
|
||||||
|
"国家数据局",
|
||||||
|
)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import cn.hutool.core.util.NumberUtil
|
|||||||
import cn.hutool.crypto.SecureUtil
|
import cn.hutool.crypto.SecureUtil
|
||||||
import com.lanyuanxiaoyao.digtal.market.Article
|
import com.lanyuanxiaoyao.digtal.market.Article
|
||||||
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
|
import com.lanyuanxiaoyao.digtal.market.ArticleRepository
|
||||||
|
import com.lanyuanxiaoyao.digtal.market.keywords
|
||||||
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
|
import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
|
||||||
import com.lanyuanxiaoyao.digtal.market.sites
|
import com.lanyuanxiaoyao.digtal.market.sites
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||||
@@ -18,6 +19,7 @@ import java.util.concurrent.atomic.AtomicLong
|
|||||||
import kotlin.time.Duration.Companion.seconds
|
import kotlin.time.Duration.Companion.seconds
|
||||||
import kotlin.time.toJavaDuration
|
import kotlin.time.toJavaDuration
|
||||||
import kotlinx.coroutines.runBlocking
|
import kotlinx.coroutines.runBlocking
|
||||||
|
import org.jsoup.Jsoup
|
||||||
import org.slf4j.LoggerFactory
|
import org.slf4j.LoggerFactory
|
||||||
import org.springframework.scheduling.annotation.Scheduled
|
import org.springframework.scheduling.annotation.Scheduled
|
||||||
import org.springframework.stereotype.Service
|
import org.springframework.stereotype.Service
|
||||||
@@ -30,6 +32,7 @@ class NewsRunner : Runner {
|
|||||||
.builder<Any>()
|
.builder<Any>()
|
||||||
.withDelay(10.seconds.toJavaDuration())
|
.withDelay(10.seconds.toJavaDuration())
|
||||||
.withMaxRetries(2)
|
.withMaxRetries(2)
|
||||||
|
.handleIf { e -> (e is PageParseException).not() }
|
||||||
.build()
|
.build()
|
||||||
|
|
||||||
@Resource
|
@Resource
|
||||||
@@ -45,8 +48,9 @@ class NewsRunner : Runner {
|
|||||||
override fun run() {
|
override fun run() {
|
||||||
sites.forEach { site ->
|
sites.forEach { site ->
|
||||||
logger.info("站点: {}", site.name)
|
logger.info("站点: {}", site.name)
|
||||||
site.tags.forEach { (tag, url) ->
|
keywords.forEach { keyword ->
|
||||||
logger.info("类目: {}, 地址: {}", tag, url)
|
val url = site.search.replace("{query}", keyword)
|
||||||
|
logger.info("类目: {}, 地址: {}", keyword, url)
|
||||||
val hashList = articleRepository.findAllId()
|
val hashList = articleRepository.findAllId()
|
||||||
val links = parseArticleLink(site.code, url, false)
|
val links = parseArticleLink(site.code, url, false)
|
||||||
val total = links.size
|
val total = links.size
|
||||||
@@ -72,12 +76,13 @@ class NewsRunner : Runner {
|
|||||||
article.score = triple?.third
|
article.score = triple?.third
|
||||||
}
|
}
|
||||||
|
|
||||||
article.category = tag
|
article.category = keyword
|
||||||
articleRepository.save(article)
|
articleRepository.save(article)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
logger.info("本轮采集完成")
|
||||||
}
|
}
|
||||||
|
|
||||||
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
|
fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
|
||||||
@@ -94,7 +99,7 @@ class NewsRunner : Runner {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
} catch (e: FailsafeException) {
|
} catch (e: FailsafeException) {
|
||||||
e.printStackTrace()
|
logger.error("Parse failure", e)
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
(page["text"] as? Map<*, *>)?.let { text ->
|
(page["text"] as? Map<*, *>)?.let { text ->
|
||||||
@@ -112,6 +117,16 @@ class NewsRunner : Runner {
|
|||||||
logger.error("Parse ${text["datetime"]} error", e)
|
logger.error("Parse ${text["datetime"]} error", e)
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
val source = text["source"]?.let {
|
||||||
|
val document = Jsoup.parse((it as String))
|
||||||
|
document
|
||||||
|
.select("script")
|
||||||
|
.forEach { node -> node.remove() }
|
||||||
|
document.forEachNode { node -> node.removeAttr("style") }
|
||||||
|
document
|
||||||
|
.body()
|
||||||
|
.html()
|
||||||
|
}
|
||||||
return Article(
|
return Article(
|
||||||
id = hash,
|
id = hash,
|
||||||
code = code,
|
code = code,
|
||||||
@@ -120,7 +135,7 @@ class NewsRunner : Runner {
|
|||||||
author = text["author"] as String?,
|
author = text["author"] as String?,
|
||||||
category = null,
|
category = null,
|
||||||
text = if ((text["content"] as String?) == null) null else text["content"] as String,
|
text = if ((text["content"] as String?) == null) null else text["content"] as String,
|
||||||
html = text["source"] as String?,
|
html = source,
|
||||||
subtitle = null,
|
subtitle = null,
|
||||||
description = null,
|
description = null,
|
||||||
score = null,
|
score = null,
|
||||||
@@ -143,7 +158,8 @@ class NewsRunner : Runner {
|
|||||||
.get(CheckedSupplier {
|
.get(CheckedSupplier {
|
||||||
runBlocking { management.parse(code, next!!) }
|
runBlocking { management.parse(code, next!!) }
|
||||||
})
|
})
|
||||||
} catch (e: PageParseException) {
|
} catch (e: FailsafeException) {
|
||||||
|
logger.error("Parse failure", e)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
|
@Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import cn.hutool.json.JSONUtil
|
|||||||
import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner
|
import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||||
import jakarta.annotation.Resource
|
import jakarta.annotation.Resource
|
||||||
|
import kotlinx.coroutines.runBlocking
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
import org.slf4j.LoggerFactory
|
import org.slf4j.LoggerFactory
|
||||||
import org.springframework.boot.test.context.SpringBootTest
|
import org.springframework.boot.test.context.SpringBootTest
|
||||||
@@ -22,23 +23,53 @@ class TestManagement {
|
|||||||
@Resource
|
@Resource
|
||||||
private lateinit var newsRunner: NewsRunner
|
private lateinit var newsRunner: NewsRunner
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun testParse() {
|
||||||
|
newsRunner
|
||||||
|
.parseArticleLink(
|
||||||
|
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
|
"https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||||
|
)
|
||||||
|
.forEach { link ->
|
||||||
|
val article = newsRunner.parseArticle(
|
||||||
|
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
|
link.url,
|
||||||
|
link.title,
|
||||||
|
link.datetime,
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun testDownload() {
|
||||||
|
val page = runBlocking {
|
||||||
|
management.download(
|
||||||
|
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
|
"https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
logger.info("Page: {}", page)
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
fun testParseList() {
|
fun testParseList() {
|
||||||
newsRunner
|
newsRunner
|
||||||
.parseArticleLink(
|
.parseArticleLink(
|
||||||
"1df28c35-1e9e-4d58-9595-f08029b160b4",
|
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
"https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
|
"https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
||||||
true,
|
true,
|
||||||
)
|
)
|
||||||
.forEach { logger.info("{} {} {}", it.datetime, it.url, it.title) }
|
.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
fun testParseArticle() {
|
fun testParseArticle() {
|
||||||
newsRunner
|
newsRunner
|
||||||
.parseArticle(
|
.parseArticle(
|
||||||
"1df28c35-1e9e-4d58-9595-f08029b160b4",
|
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
"https://www.cac.gov.cn/2024-10/14/c_1730595202555062.htm",
|
"https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589",
|
||||||
"no title",
|
"no title",
|
||||||
"no datetime",
|
"no datetime",
|
||||||
"",
|
"",
|
||||||
|
|||||||
@@ -48,3 +48,23 @@ Content-Type: application/json
|
|||||||
"你好,你能帮我做什么"
|
"你好,你能帮我做什么"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
### Search
|
||||||
|
GET https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=2&pagesize=20&text=数据要素&order=1&position=all
|
||||||
|
|
||||||
|
### Search
|
||||||
|
POST https://search.gd.gov.cn/api/search/all
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"keywords": "数据要素",
|
||||||
|
"advance": "true",
|
||||||
|
"sort": "time",
|
||||||
|
"position": "all",
|
||||||
|
"time_to": 2524579200,
|
||||||
|
"time_from": 189273600,
|
||||||
|
"site_id": "246",
|
||||||
|
"range": "site",
|
||||||
|
"recommand": 1
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user