feat: 增加北京政数局
This commit is contained in:
@@ -15,6 +15,7 @@ import com.lanyuanxiaoyao.squirrel.core.common.Site
|
|||||||
private val html = Pair("html", "true")
|
private val html = Pair("html", "true")
|
||||||
private val iframe = Pair("iframe", "true")
|
private val iframe = Pair("iframe", "true")
|
||||||
private val post = Pair("post", "true")
|
private val post = Pair("post", "true")
|
||||||
|
private val form = Pair("form", "true")
|
||||||
|
|
||||||
private val timeScript = Script(
|
private val timeScript = Script(
|
||||||
Script.Type.Javascript,
|
Script.Type.Javascript,
|
||||||
@@ -394,12 +395,72 @@ val 广州数据交易所 = Site(
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
val 北京市政务服务和数据管理局 = Site(
|
||||||
|
code = "cee7f242-668b-41fb-adbc-96fb27d4bf35",
|
||||||
|
name = "北京市政务服务和数据管理局",
|
||||||
|
author = "lanyuanxiaoyao",
|
||||||
|
home = "https://zwfwj.beijing.gov.cn",
|
||||||
|
icon = "https://zwfwj.beijing.gov.cn/favicon.ico",
|
||||||
|
target = Site.Target.SEARCH,
|
||||||
|
parser = Parser.Type.JSON,
|
||||||
|
search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20",
|
||||||
|
rules = mapOf(
|
||||||
|
// language=regexp
|
||||||
|
"https://zwfwj\\.beijing\\.gov\\.cn/so/ss/query/s.+" to Rule(
|
||||||
|
properties = mapOf(form),
|
||||||
|
list = Content(
|
||||||
|
expression = "$.resultDocs",
|
||||||
|
title = Selector("$.data.title", process = Process(remove = commonRemove)),
|
||||||
|
dateTime = Selector(
|
||||||
|
"$.timestamp",
|
||||||
|
process = Process(
|
||||||
|
script = listOf(
|
||||||
|
Script(
|
||||||
|
Script.Type.Javascript,
|
||||||
|
// language=javascript
|
||||||
|
"if (text && text !== '') {\n let datetime = new Date(parseInt(text))\n return `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`\n}\nreturn ''",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
),
|
||||||
|
link = Selector("$.data.url")
|
||||||
|
),
|
||||||
|
next = Selector("$.totalHits", process = Process(script = listOf(
|
||||||
|
Script(
|
||||||
|
Script.Type.Javascript,
|
||||||
|
// language=javascript
|
||||||
|
"if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}"
|
||||||
|
))
|
||||||
|
))
|
||||||
|
),
|
||||||
|
// language=regexp
|
||||||
|
"https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(
|
||||||
|
parser = Parser.Type.CSS,
|
||||||
|
downloader = Downloader.Type.BROWSER,
|
||||||
|
text = Content(
|
||||||
|
expression = "#main .details_page",
|
||||||
|
title = Selector("h1"),
|
||||||
|
author = Selector(".article-info .ly", process = Process(remove = listOf("来源:"))),
|
||||||
|
dateTime = Selector(".article-info span:contains(时间)", process = Process(remove = listOf("时间:"))),
|
||||||
|
content = Selector("#div_zhengwen")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
val sites = setOf(
|
val sites = setOf(
|
||||||
广东政务服务和数据管理局,
|
// 国家级
|
||||||
深圳市政务服务和数据管理局,
|
|
||||||
中华人民共和国中央人民政府,
|
中华人民共和国中央人民政府,
|
||||||
中华人民共和国国家互联网信息办公室,
|
中华人民共和国国家互联网信息办公室,
|
||||||
|
// 省级
|
||||||
|
广东政务服务和数据管理局,
|
||||||
|
// 地市级
|
||||||
|
// 北京
|
||||||
|
北京市政务服务和数据管理局,
|
||||||
|
// 广州
|
||||||
广州数据交易所,
|
广州数据交易所,
|
||||||
|
// 深圳
|
||||||
|
深圳市政务服务和数据管理局,
|
||||||
)
|
)
|
||||||
|
|
||||||
val keywords = setOf(
|
val keywords = setOf(
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat
|
|||||||
import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat
|
import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Script
|
import com.lanyuanxiaoyao.squirrel.core.common.Script
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.ScriptExecutor
|
import com.lanyuanxiaoyao.squirrel.core.common.ScriptExecutor
|
||||||
|
import com.lanyuanxiaoyao.squirrel.core.common.parseQueryParams
|
||||||
import com.lanyuanxiaoyao.squirrel.core.jvm.BINARY_PATH
|
import com.lanyuanxiaoyao.squirrel.core.jvm.BINARY_PATH
|
||||||
import com.lanyuanxiaoyao.squirrel.core.jvm.BasicDownloaderFactory
|
import com.lanyuanxiaoyao.squirrel.core.jvm.BasicDownloaderFactory
|
||||||
import com.lanyuanxiaoyao.squirrel.core.jvm.ChromeDownloaderFactory
|
import com.lanyuanxiaoyao.squirrel.core.jvm.ChromeDownloaderFactory
|
||||||
@@ -20,10 +21,7 @@ import org.junit.jupiter.api.Test
|
|||||||
class Test {
|
class Test {
|
||||||
@Test
|
@Test
|
||||||
fun testTimestamp() {
|
fun testTimestamp() {
|
||||||
println(
|
println("http://www.gov.cn/xinwen/2023-02/16/content_5741672.htm?hello=&hello=world&go=&ni=wps".parseQueryParams())
|
||||||
DateUtil.parse("2018-08-08", "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd")
|
|
||||||
.toTimestamp()
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
@@ -56,11 +56,12 @@ class TestManagement {
|
|||||||
fun testParseList() {
|
fun testParseList() {
|
||||||
newsRunner
|
newsRunner
|
||||||
.parseArticleLink(
|
.parseArticleLink(
|
||||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
"cee7f242-668b-41fb-adbc-96fb27d4bf35",
|
||||||
"https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
|
"https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
|
||||||
true,
|
true,
|
||||||
)
|
)
|
||||||
.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
|
// .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
|
||||||
|
.let { it.forEach { logger.info("{}", it.url) } }
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -68,7 +69,7 @@ class TestManagement {
|
|||||||
newsRunner
|
newsRunner
|
||||||
.parseArticle(
|
.parseArticle(
|
||||||
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
|
||||||
"https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589",
|
"https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html",
|
||||||
"no title",
|
"no title",
|
||||||
"no datetime",
|
"no datetime",
|
||||||
"",
|
"",
|
||||||
|
|||||||
@@ -1,15 +1,8 @@
|
|||||||
package com.lanyuanxiaoyao.digtal.market
|
package com.lanyuanxiaoyao.digtal.market
|
||||||
|
|
||||||
import cn.hutool.json.JSONUtil
|
import cn.hutool.json.JSONUtil
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Content
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.InMemoryDatabase
|
import com.lanyuanxiaoyao.squirrel.core.common.InMemoryDatabase
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
import com.lanyuanxiaoyao.squirrel.core.common.Management
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Parser
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Process
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Rule
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Script
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Selector
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.common.Site
|
|
||||||
import com.lanyuanxiaoyao.squirrel.core.jvm.BINARY_PATH
|
import com.lanyuanxiaoyao.squirrel.core.jvm.BINARY_PATH
|
||||||
import com.lanyuanxiaoyao.squirrel.core.jvm.DRIVER_PATH
|
import com.lanyuanxiaoyao.squirrel.core.jvm.DRIVER_PATH
|
||||||
import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
|
import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
|
||||||
@@ -21,72 +14,8 @@ class TestRule {
|
|||||||
private lateinit var management: Management
|
private lateinit var management: Management
|
||||||
|
|
||||||
private val link =
|
private val link =
|
||||||
"https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%223010%22%7D"
|
"https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html"
|
||||||
private val site = Site(
|
private val site = 北京市政务服务和数据管理局
|
||||||
code = "9a7f1d8f-4f39-4120-adeb-7435339b97bb",
|
|
||||||
name = "广州数据交易所",
|
|
||||||
author = "lanyuanxiaoyao",
|
|
||||||
home = "https://www.cantonde.com",
|
|
||||||
icon = "https://www.cantonde.com/favicon.ico",
|
|
||||||
description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场,助力数字经济高质量发展工作部署,高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽,是畅通数据要素大循环的关键举措,也是推进数据要素市场化配置改革的重要载体。",
|
|
||||||
target = Site.Target.SEARCH,
|
|
||||||
parser = Parser.Type.JSON,
|
|
||||||
search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
|
|
||||||
rules = mapOf(
|
|
||||||
// language=regexp
|
|
||||||
"https://www\\.cantonde\\.com/si/common/searchInfo\\?json=.+" to Rule(
|
|
||||||
list = Content(
|
|
||||||
expression = "$.data",
|
|
||||||
title = Selector("$.TITLE"),
|
|
||||||
dateTime = Selector("$.PUBTIME_ORDER"),
|
|
||||||
link = Selector(
|
|
||||||
"$.ID", process = Process(
|
|
||||||
script = listOf(
|
|
||||||
Script(
|
|
||||||
Script.Type.Javascript,
|
|
||||||
// language=javascript
|
|
||||||
"return `https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%22\${text}%22%7D`",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
properties = mapOf("post" to "true"),
|
|
||||||
next = Selector(
|
|
||||||
"$.extra.total", process = Process(
|
|
||||||
script = listOf(
|
|
||||||
Script(
|
|
||||||
Script.Type.Javascript,
|
|
||||||
// language=javascript
|
|
||||||
"if (text && text !== '') {\n let total = parseInt(text)\n let postData = JSON.parse(params['json'])\n let current = postData['pageNo']\n let size = postData['pageSize']\n if (current * size >= total) {\n return ''\n }\n postData['pageNo'] = current + 1\n return `https://www.cantonde.com/si/common/searchInfo?json=` + encodeURIComponent(JSON.stringify(postData))\n}\nreturn ''"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
),
|
|
||||||
// language=regexp
|
|
||||||
"https://www\\.cantonde\\.com/si/info/detail\\?json=.+" to Rule(
|
|
||||||
text = Content(
|
|
||||||
expression = "$.data",
|
|
||||||
title = Selector("$.TITLE"),
|
|
||||||
content = Selector("$.CONTENT"),
|
|
||||||
dateTime = Selector(
|
|
||||||
process = Process(
|
|
||||||
script = listOf(
|
|
||||||
Script(
|
|
||||||
Script.Type.Javascript,
|
|
||||||
// language=javascript
|
|
||||||
"return params['datetime'] ?? ''",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
author = Selector(process = Process(default = "广州数据交易所"))
|
|
||||||
),
|
|
||||||
properties = mapOf("post" to "true"),
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
@BeforeTest
|
@BeforeTest
|
||||||
fun before() {
|
fun before() {
|
||||||
@@ -112,4 +41,15 @@ class TestRule {
|
|||||||
}
|
}
|
||||||
println(JSONUtil.toJsonPrettyStr(page))
|
println(JSONUtil.toJsonPrettyStr(page))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun testDownload() {
|
||||||
|
val page = runBlocking {
|
||||||
|
management.download(
|
||||||
|
code = site.code,
|
||||||
|
url = link,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
println(page)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -70,14 +70,4 @@ Content-Type: application/json
|
|||||||
}
|
}
|
||||||
|
|
||||||
### Search
|
### Search
|
||||||
POST https://www.cantonde.com/si/common/searchInfo
|
POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
{
|
|
||||||
"NAME": "数据要素",
|
|
||||||
"IN_CATEGORY": "",
|
|
||||||
"NOT_IN_CATEGORY": "",
|
|
||||||
"CATEGORY": "",
|
|
||||||
"pageNo": 1,
|
|
||||||
"pageSize": 10
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user