diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt index 2c53567..2211987 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt @@ -14,6 +14,7 @@ import com.lanyuanxiaoyao.squirrel.core.common.Site private val html = Pair("html", "true") private val iframe = Pair("iframe", "true") +private val post = Pair("post", "true") private val timeScript = Script( Script.Type.Javascript, @@ -27,35 +28,62 @@ private val titleScript = Script( script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}", ) +// language=regexp +private val commonRemove = listOf( + "<.+?>", + "&.+?;" +) + private val 广东政务服务和数据管理局 = Site( code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", name = "广东政务服务和数据管理局", home = "https://zfsg.gd.gov.cn", parser = Parser.Type.CSS, author = "lanyuanxiaoyao", - target = Site.Target.TEXT, + target = Site.Target.SEARCH, downloader = Downloader.Type.BROWSER, - tags = mapOf( - "数据要闻" to "https://zfsg.gd.gov.cn/xxfb/ywsd/index.html", - "省局要闻" to "https://zfsg.gd.gov.cn/xxfb/sjyw/index.html", - "动态新闻" to "https://zfsg.gd.gov.cn/xxfb/dtxw/index.html", - "媒体报道" to "https://zfsg.gd.gov.cn/xxfb/mtbd/index.html", - "政务文件" to "https://zfsg.gd.gov.cn/zwgk/wjk/index.html", - "政策解读" to "https://zfsg.gd.gov.cn/zwgk/zcjd2/index.html", - ), + search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D", rules = mapOf( // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/index(_\\d*)*\\.html" to Rule( + "https://search.gd.gov.cn/api/search/all.*" to Rule( + downloader = Downloader.Type.HTTP, + parser = Parser.Type.JSON, list = Content( - expression = "ul.newList > li", - title = Selector(".til > a"), - dateTime = Selector(".time"), - link = Selector(".til > a", "href"), + expression = "$.data.news.list", + title = Selector("$.title", process = Process(remove = commonRemove)), + author = Selector("$.source"), + dateTime = Selector("$.publish_time"), + link = Selector("$.url"), ), - next = Selector(".page > a.next", "href") + properties = mapOf(post), + next = Selector( + "$.data.news.total", process = Process( + script = listOf( + Script( + Script.Type.Javascript, + // language=javascript + "let url = params['url']\nif ((url && url !== '') && (text && text !== '')) {\n let total = parseInt(text)\n let postData = JSON.parse(params['json'])\n let current = parseInt(postData['page'] ?? '1')\n if (current * 20 >= total) {\n return ''\n }\n postData['page'] = current + 1\n return 'https://search.gd.gov.cn/api/search/all?json=' + encodeURIComponent(JSON.stringify(postData))\n}\nreturn ''" + ) + ) + ) + ) ), // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk)/.+/content/post_\\d+\\.html" to Rule( + "https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule( + downloader = Downloader.Type.BROWSER, + text = Content( + expression = ".content-container", + title = Selector(".content-box .content h1.title"), + author = Selector("td.first:contains(发布机构) + td > span"), + dateTime = Selector("td.second:contains(成文日期) + td > span"), + content = Selector(".content .article-content"), + extra = mapOf( + "source" to Selector(".content .article-content", properties = mapOf(html)) + ), + ) + ), + // language=regexp + "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule( text = Content( expression = ".Con", title = Selector("h3.zw-title"), @@ -119,16 +147,23 @@ private val 深圳市政务服务和数据管理局 = Site( target = Site.Target.TEXT, downloader = Downloader.Type.HTTP, properties = mapOf(iframe), - tags = mapOf( - "工作动态" to "http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page=1", - ), + search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all", rules = mapOf( // language=regexp - "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/api/all/19236\\?page=\\d+" to Rule( + "https*://search\\.gd\\.gov\\.cn/jsonp/site/755576\\?callback=getResult&page=\\d+&pagesize=20&text=.+" to Rule( parser = Parser.Type.JSON, + preload = Process( + script = listOf( + Script( + Script.Type.Javascript, + // language=javascript + "let getResult = result => result\nlet results = eval(text)\nreturn JSON.stringify(results)" + ) + ) + ), list = Content( - expression = "$.articles", - title = Selector("$.title"), + expression = "$.results", + title = Selector("$.title", process = Process(remove = commonRemove)), dateTime = Selector( "$.first_publish_time", process = Process( @@ -144,13 +179,13 @@ private val 深圳市政务服务和数据管理局 = Site( link = Selector("$.url", process = Process(replace = listOf(Replace("https", "http")))), ), next = Selector( - "$.total", + "$.count", process = Process( script = listOf( Script( Script.Type.Javascript, // language=javascript - script = "let total = parseInt(text)\nlet base = 'http://www.sz.gov.cn/szzsj/gkmlpt/api/all/19236?page='\nlet count = parseInt(params['page'] ?? '1')\nif (count * 100 >= total) {\n return ''\n}\nreturn `\${base}\${count + 1}`\n" + script = "let url = params['url']\nif (url && url !== '') {\n let current = parseInt(params['page'] ?? '1')\n let total = parseInt(text)\n if (current * 20 >= total) {\n return ''\n }\n return url.replace(/page=\\d+/, 'page=' + (current + 1))\n}\nreturn ''" ) ) ) @@ -183,14 +218,7 @@ private val 中华人民共和国中央人民政府 = Site( target = Site.Target.TEXT, downloader = Downloader.Type.HTTP, properties = mapOf(iframe), - tags = mapOf( - "搜索-数据要素" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%A6%81%E7%B4%A0%22%7D", - "搜索-国家数据局" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%9B%BD%E5%AE%B6%E6%95%B0%E6%8D%AE%E5%B1%80%22%7D", - "搜索-可信数据空间" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E5%8F%AF%E4%BF%A1%E6%95%B0%E6%8D%AE%E7%A9%BA%E9%97%B4%22%7D", - "搜索-数据基础设施" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%22%7D", - "搜索-数据跨境" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E8%B7%A8%E5%A2%83%22%7D", - "搜索-数据安全" to "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22%E6%95%B0%E6%8D%AE%E5%AE%89%E5%85%A8%22%7D", - ), + search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D", rules = mapOf( // language=regexp "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule( @@ -270,13 +298,7 @@ private val 中华人民共和国国家互联网信息办公室 = Site( author = "lanyuanxiaoyao", target = Site.Target.TEXT, downloader = Downloader.Type.HTTP, - tags = mapOf( - "搜索-数据要素" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", - "搜索-数据跨境" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", - "搜索-数据基础设施" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", - "搜索-数据安全" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", - "搜索-数据交易" to "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据跨境&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", - ), + search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", rules = mapOf( "https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=¬pro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule( list = Content( @@ -312,3 +334,12 @@ val sites = setOf( 中华人民共和国中央人民政府, 中华人民共和国国家互联网信息办公室, ) + +val keywords = setOf( + "数据要素", + "数据安全", + "数据跨境", + "数据交易", + "数据基础设施", + "国家数据局", +) diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt index 4f28747..e98988c 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt @@ -5,6 +5,7 @@ import cn.hutool.core.util.NumberUtil import cn.hutool.crypto.SecureUtil import com.lanyuanxiaoyao.digtal.market.Article import com.lanyuanxiaoyao.digtal.market.ArticleRepository +import com.lanyuanxiaoyao.digtal.market.keywords import com.lanyuanxiaoyao.digtal.market.service.DescriptionService import com.lanyuanxiaoyao.digtal.market.sites import com.lanyuanxiaoyao.squirrel.core.common.Management @@ -18,6 +19,7 @@ import java.util.concurrent.atomic.AtomicLong import kotlin.time.Duration.Companion.seconds import kotlin.time.toJavaDuration import kotlinx.coroutines.runBlocking +import org.jsoup.Jsoup import org.slf4j.LoggerFactory import org.springframework.scheduling.annotation.Scheduled import org.springframework.stereotype.Service @@ -30,6 +32,7 @@ class NewsRunner : Runner { .builder() .withDelay(10.seconds.toJavaDuration()) .withMaxRetries(2) + .handleIf { e -> (e is PageParseException).not() } .build() @Resource @@ -45,8 +48,9 @@ class NewsRunner : Runner { override fun run() { sites.forEach { site -> logger.info("站点: {}", site.name) - site.tags.forEach { (tag, url) -> - logger.info("类目: {}, 地址: {}", tag, url) + keywords.forEach { keyword -> + val url = site.search.replace("{query}", keyword) + logger.info("类目: {}, 地址: {}", keyword, url) val hashList = articleRepository.findAllId() val links = parseArticleLink(site.code, url, false) val total = links.size @@ -72,12 +76,13 @@ class NewsRunner : Runner { article.score = triple?.third } - article.category = tag + article.category = keyword articleRepository.save(article) } } } } + logger.info("本轮采集完成") } fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? { @@ -94,7 +99,7 @@ class NewsRunner : Runner { } }) } catch (e: FailsafeException) { - e.printStackTrace() + logger.error("Parse failure", e) return null } (page["text"] as? Map<*, *>)?.let { text -> @@ -112,6 +117,16 @@ class NewsRunner : Runner { logger.error("Parse ${text["datetime"]} error", e) null } + val source = text["source"]?.let { + val document = Jsoup.parse((it as String)) + document + .select("script") + .forEach { node -> node.remove() } + document.forEachNode { node -> node.removeAttr("style") } + document + .body() + .html() + } return Article( id = hash, code = code, @@ -120,7 +135,7 @@ class NewsRunner : Runner { author = text["author"] as String?, category = null, text = if ((text["content"] as String?) == null) null else text["content"] as String, - html = text["source"] as String?, + html = source, subtitle = null, description = null, score = null, @@ -143,7 +158,8 @@ class NewsRunner : Runner { .get(CheckedSupplier { runBlocking { management.parse(code, next!!) } }) - } catch (e: PageParseException) { + } catch (e: FailsafeException) { + logger.error("Parse failure", e) continue } @Suppress("UNCHECKED_CAST") (page["list"] as? List>)?.let { list -> diff --git a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt index 6ecd8fb..2318e6e 100644 --- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt +++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt @@ -4,6 +4,7 @@ import cn.hutool.json.JSONUtil import com.lanyuanxiaoyao.digtal.market.runner.NewsRunner import com.lanyuanxiaoyao.squirrel.core.common.Management import jakarta.annotation.Resource +import kotlinx.coroutines.runBlocking import org.junit.jupiter.api.Test import org.slf4j.LoggerFactory import org.springframework.boot.test.context.SpringBootTest @@ -22,23 +23,53 @@ class TestManagement { @Resource private lateinit var newsRunner: NewsRunner + @Test + fun testParse() { + newsRunner + .parseArticleLink( + "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", + "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D", + ) + .forEach { link -> + val article = newsRunner.parseArticle( + "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", + link.url, + link.title, + link.datetime, + "", + ) + logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text) + } + } + + @Test + fun testDownload() { + val page = runBlocking { + management.download( + "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", + "https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589", + ) + } + logger.info("Page: {}", page) + } + @Test fun testParseList() { newsRunner .parseArticleLink( - "1df28c35-1e9e-4d58-9595-f08029b160b4", - "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro=数据要素&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", + "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", + "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22数据要素%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D", true, ) - .forEach { logger.info("{} {} {}", it.datetime, it.url, it.title) } + .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) } } @Test fun testParseArticle() { newsRunner .parseArticle( - "1df28c35-1e9e-4d58-9595-f08029b160b4", - "https://www.cac.gov.cn/2024-10/14/c_1730595202555062.htm", + "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", + "https://zfsg.gd.gov.cn/gkmlpt/content/4/4514/post_4514242.html#2589", "no title", "no datetime", "", diff --git a/src/test/resources/test.http b/src/test/resources/test.http index 51b5727..1de8e91 100644 --- a/src/test/resources/test.http +++ b/src/test/resources/test.http @@ -48,3 +48,23 @@ Content-Type: application/json "你好,你能帮我做什么" ] } + +### Search +GET https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=2&pagesize=20&text=数据要素&order=1&position=all + +### Search +POST https://search.gd.gov.cn/api/search/all +Content-Type: application/json + +{ + "page": 1, + "keywords": "数据要素", + "advance": "true", + "sort": "time", + "position": "all", + "time_to": 2524579200, + "time_from": 189273600, + "site_id": "246", + "range": "site", + "recommand": 1 +} \ No newline at end of file