diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Entity.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Entity.kt index d5d635d..b3c454f 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Entity.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Entity.kt @@ -24,7 +24,7 @@ class Article( @Column(columnDefinition = "longtext") var html: String?, var subtitle: String?, @Column(columnDefinition = "longtext") var description: String?, - var score: Int?, + var tags: String?, var createTime: Date?, var pushed: Boolean?, ) @@ -41,7 +41,7 @@ interface ArticleRepository : JpaRepository, JpaSpecificationEx @Query("update Article article set article.pushed = :pushed where article.id = :id") fun updatePushedById(@Param("id") id: String, @Param("pushed") pushed: Boolean) - @Query("select article.id from Article article where article.description is not null and article.subtitle is not null and article.score is not null") + @Query("select article.id from Article article where article.description is not null and article.text is not null and article.text <> ''") fun findAllId(): List @Query("select new com.lanyuanxiaoyao.digtal.market.CountGroupByString(article.code, count(article.code)) from Article article group by article.code") diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt new file mode 100644 index 0000000..b0be2f3 --- /dev/null +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt @@ -0,0 +1,153 @@ +package com.lanyuanxiaoyao.digtal.market + +import cn.hutool.core.date.DateUtil +import cn.hutool.crypto.SecureUtil +import com.lanyuanxiaoyao.squirrel.core.common.Management +import com.lanyuanxiaoyao.squirrel.core.common.PageParseException +import dev.failsafe.Failsafe +import dev.failsafe.FailsafeException +import dev.failsafe.RetryPolicy +import dev.failsafe.function.CheckedSupplier +import kotlin.time.Duration.Companion.seconds +import kotlin.time.toJavaDuration +import kotlinx.coroutines.runBlocking +import org.jsoup.Jsoup +import org.slf4j.LoggerFactory + +data class ArticleLink( + val code: String, + val url: String, + val hash: String, + val title: String?, + val datetime: String?, +) + +private val logger = LoggerFactory.getLogger("Helper") + +private val retryPolicy = RetryPolicy + .builder() + .withDelay(10.seconds.toJavaDuration()) + .withMaxRetries(2) + .handleIf { e -> (e is PageParseException).not() } + .build() + +@Suppress("UNCHECKED_CAST") +fun parseArticleLink(management: Management, code: String, url: String?, recursive: Boolean = false): List { + var next: String? = url + val links = mutableListOf() + do { + val page = try { + Failsafe + .with(retryPolicy) + .get(CheckedSupplier { + runBlocking { management.parse(code, next!!) } + }) + } catch (e: FailsafeException) { + logger.error("解析失败:${next}", e) + continue + } + (page["list"] as? List>) + ?.let { list -> + for (item in list) { + val link = item["link"] as String? + if (link.isNullOrBlank()) { + logger.warn("链接为空:{} {}", item["title"] as String?, next) + } else { + links.add( + ArticleLink( + code, + link, + SecureUtil.md5(link), + item["title"] as String?, + item["datetime"] as String?, + ) + ) + } + } + } + next = page["next"] as String? + } while (recursive && !next.isNullOrBlank()) + return links +} + +fun parseArticle(management: Management, articleLink: ArticleLink): Article? { + val (code, url, hash, title, datetime) = articleLink + return parseArticle(management, code, url, hash, title, datetime) +} + +fun parseArticle(management: Management, code: String, url: String, hash: String, title: String?, datetime: String?): Article? { + logger.info("标题:{} 时间:{}", title, datetime) + val page = try { + Failsafe + .with(retryPolicy) + .get(CheckedSupplier { + runBlocking { management.parse(code, url) } + }) + } catch (e: FailsafeException) { + logger.error("解析失败:${url}", e) + return null + } + (page["text"] as? Map<*, *>)?.let { text -> + var datetimeText = text["datetime"] as? String + val createTime = try { + if (datetimeText.isNullOrBlank()) { + datetimeText = datetime + } + if (datetimeText?.matches(Regex("\\d{13}")) == true) { + DateUtil + .date(datetimeText.toLong()) + .toTimestamp() + } else if (datetimeText?.matches(Regex("\\d{10}")) == true) { + DateUtil + .date(datetimeText.toLong() * 1000) + .toTimestamp() + } else { + DateUtil + .parse( + datetimeText, + "yyyy-MM-dd HH:mm:ss", + "yyyy-MM-dd", + "yyyy年MM月dd日 HH:mm", + "yyyy-MM-dd HH:mm", + ) + .toTimestamp() + } + } catch (e: Exception) { + logger.error("解析 $datetimeText 失败", e) + null + } + val sourceText = (text["source"] as? String) ?: "" + val source = if (sourceText.isBlank()) { + sourceText + } else { + sourceText.let { + val document = Jsoup.parse((it as String)) + document + .select("script") + .forEach { it.remove() } + document + .select("link") + .forEach { it.remove() } + document.forEachNode { node -> node.removeAttr("style") } + document + .body() + .html() + } + } + return Article( + id = hash, + code = code, + url = url, + title = null, + author = null, + category = null, + text = text["content"] as String? ?: "", + html = source, + subtitle = null, + description = null, + tags = null, + createTime = createTime, + pushed = false, + ) + } ?: return null +} \ No newline at end of file diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt index 05e8d71..e289403 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt @@ -11,11 +11,11 @@ import com.lanyuanxiaoyao.squirrel.core.common.Rule import com.lanyuanxiaoyao.squirrel.core.common.Script import com.lanyuanxiaoyao.squirrel.core.common.Selector import com.lanyuanxiaoyao.squirrel.core.common.Site +import com.lanyuanxiaoyao.squirrel.core.common.Site.Option.Companion.OPEN_WITH_IFRAME -private val html = Pair("html", "true") -private val iframe = Pair("iframe", "true") -private val post = Pair("post", "true") -private val form = Pair("form", "true") +private val html = "html" to "true" +private val post = "post" to "true" +private val form = "form" to "true" // language=regexp private val commonRemove = listOf( @@ -23,19 +23,29 @@ private val commonRemove = listOf( "&.+?;" ) +private val sourceRule = + // language=regexp + "https*://.+" to Rule( + downloader = Downloader.Type.BROWSER, + parser = Parser.Type.CSS, + text = Content( + expression = "body", + content = Selector(":root"), + extra = mapOf( + "source" to Selector(":root", properties = mapOf(html)) + ), + ) + ) + private val 广东政务服务和数据管理局 = Site( code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e", name = "广东政务服务和数据管理局", home = "https://zfsg.gd.gov.cn", - parser = Parser.Type.CSS, author = "lanyuanxiaoyao", - target = Site.Target.SEARCH, - downloader = Downloader.Type.BROWSER, search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D", rules = mapOf( // language=regexp "https://search.gd.gov.cn/api/search/all.*" to Rule( - downloader = Downloader.Type.HTTP, parser = Parser.Type.JSON, list = Content( expression = "$.data.news.list", @@ -57,71 +67,7 @@ private val 广东政务服务和数据管理局 = Site( ) ) ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".content-container", - title = Selector(".content-box .content h1.title"), - author = Selector("td.first:contains(发布机构) + td > span"), - dateTime = Selector("td.second:contains(成文日期) + td > span"), - content = Selector(".content .article-content"), - extra = mapOf( - "source" to Selector(".content .article-content", properties = mapOf(html)) - ), - ) - ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule( - text = Content( - expression = ".Con", - title = Selector("h3.zw-title"), - author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))), - dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))), - content = Selector(".zw"), - extra = mapOf( - "source" to Selector(".zw", properties = mapOf(html)) - ) - ) - ), - // language=regexp - "https://mp\\.weixin\\.qq\\.com/s/.+" to Rule( - text = Content( - expression = "#page-content", - title = Selector("#activity-name"), - author = Selector("#js_name"), - dateTime = Selector("#publish_time"), - content = Selector("#js_content"), - extra = mapOf( - "source" to Selector("#js_content", properties = mapOf(html)) - ) - ) - ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule( - text = Content( - expression = "#article-container", - title = Selector("#article-title"), - author = Selector("#article-source", process = Process(remove = listOf("发布机构:"))), - dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间:"))), - content = Selector("#article-content"), - extra = mapOf( - "source" to Selector("#article-content", properties = mapOf(html)) - ) - ) - ), - // language=regexp - "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule( - text = Content( - expression = "body", - title = Selector(".title-page .txt > span"), - author = Selector(process = Process(default = "广东政务服务和数据管理局")), - content = Selector(".content"), - extra = mapOf( - "source" to Selector(".content", properties = mapOf(html)) - ) - ) - ), + sourceRule, ) ) @@ -130,11 +76,8 @@ private val 深圳市政务服务和数据管理局 = Site( name = "深圳市政务服务和数据管理局", home = "https://www.sz.gov.cn/szzsj/gkmlpt/index", icon = "https://www.sz.gov.cn/favicon.ico", - parser = Parser.Type.CSS, author = "lanyuanxiaoyao", - target = Site.Target.TEXT, - downloader = Downloader.Type.HTTP, - properties = mapOf(iframe), + options = listOf(OPEN_WITH_IFRAME), search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all", rules = mapOf( // language=regexp @@ -179,20 +122,7 @@ private val 深圳市政务服务和数据管理局 = Site( ) ) ), - // language=regexp - "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".content-container", - title = Selector(".content-box .content h1.title"), - author = Selector("td.first:contains(发布机构) + td > span"), - dateTime = Selector("td.second:contains(成文日期) + td > span"), - content = Selector(".content .article-content"), - extra = mapOf( - "source" to Selector(".content .article-content", properties = mapOf(html)) - ), - ) - ) + sourceRule, ) ) @@ -201,12 +131,9 @@ private val 中华人民共和国中央人民政府 = Site( name = "中华人民共和国中央人民政府", home = "https://www.gov.cn", icon = "https://www.gov.cn/favicon.ico", - parser = Parser.Type.CSS, author = "lanyuanxiaoyao", - target = Site.Target.TEXT, - downloader = Downloader.Type.HTTP, - properties = mapOf(iframe), search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D", + options = listOf(OPEN_WITH_IFRAME), rules = mapOf( // language=regexp "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule( @@ -216,10 +143,10 @@ private val 中华人民共和国中央人民政府 = Site( "Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D", "Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2", ), - properties = mapOf("post" to "true"), + properties = mapOf(post), list = Content( expression = "$.result.data.middle.list", - title = Selector("$.title_no_tag"), + title = Selector("$.title_no_tag", process = Process(remove = commonRemove)), dateTime = Selector("$.time"), link = Selector("$.url"), ), @@ -236,51 +163,7 @@ private val 中华人民共和国中央人民政府 = Site( ) ) ), - // language=regexp - "https*://www\\.gov\\.cn/.+/(zhengceku)/.+/content_\\d+\\.html*" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".policyLibraryOverview_content", - author = Selector("td:contains(源:) + td"), - content = Selector(".pages_content"), - extra = mapOf( - "source" to Selector(".pages_content", properties = mapOf("html" to "true")) - ) - ) - ), - // language=regexp - "https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".content .article", - title = Selector("h1#ti"), - author = Selector( - ".pages-date > .font", - process = Process( - default = "中华人民共和国中央人民政府", - remove = listOf("来源:") - ) - ), - dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")), - content = Selector(".pages_content"), - extra = mapOf( - "source" to Selector(".pages_content", properties = mapOf("html" to "true")) - ) - ) - ), - // language=regexp - "https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule( - downloader = Downloader.Type.BROWSER, - text = Content( - expression = ".main-content", - title = Selector(".qa_content_box"), - author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源:"))), - content = Selector(".qa_content_text"), - extra = mapOf( - "source" to Selector(".main-content", properties = mapOf("html" to "true")) - ) - ) - ), + sourceRule, ) ) @@ -289,12 +172,10 @@ private val 中华人民共和国国家互联网信息办公室 = Site( name = "中华人民共和国国家互联网信息办公室", home = "https://www.cac.gov.cn", icon = "https://www.cac.gov.cn/favicon.ico", - parser = Parser.Type.CSS, author = "lanyuanxiaoyao", - target = Site.Target.TEXT, - downloader = Downloader.Type.HTTP, search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=¬pro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09", rules = mapOf( + // language=regexp "https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=¬pro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule( list = Content( expression = ".xpage-container .list-item", @@ -302,24 +183,9 @@ private val 中华人民共和国国家互联网信息办公室 = Site( dateTime = Selector(".search_time"), link = Selector("a", "href", process = Process(prefix = "https:")) ), - next = Selector( - ".xpage-pagination .xpage-pagination-next a:contains(下一页)", - "href", - Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/") - ) + next = Selector(".xpage-pagination .xpage-pagination-next a:contains(下一页)", "href", Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/")) ), - "https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" to Rule( - text = Content( - expression = ".main", - title = Selector("h1.title"), - author = Selector("#source", process = Process(remove = listOf("来源:"))), - dateTime = Selector("#pubtime"), - content = Selector(".main-content"), - extra = mapOf( - "source" to Selector(".main", properties = mapOf("html" to "true")) - ) - ) - ) + sourceRule, ) ) @@ -330,7 +196,6 @@ val 广州数据交易所 = Site( home = "https://www.cantonde.com", icon = "https://www.cantonde.com/favicon.ico", description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场,助力数字经济高质量发展工作部署,高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽,是畅通数据要素大循环的关键举措,也是推进数据要素市场化配置改革的重要载体。", - target = Site.Target.SEARCH, parser = Parser.Type.JSON, search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D", rules = mapOf( @@ -346,7 +211,7 @@ val 广州数据交易所 = Site( Script( Script.Type.Javascript, // language=javascript - "return `https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%22\${text}%22%7D`", + "return `https://www.cantonde.com/info.html#/infoDetail?id=\${text}`", ) ) ) @@ -365,26 +230,16 @@ val 广州数据交易所 = Site( ) ), ), - // language=regexp - "https://www\\.cantonde\\.com/si/info/detail\\?json=.+" to Rule( - text = Content( - expression = "$.data", - title = Selector("$.TITLE"), - content = Selector("$.CONTENT", process = Process(remove = commonRemove)), - author = Selector(process = Process(default = "广州数据交易所")) - ), - properties = mapOf(post), - ), + sourceRule, ), ) -val 北京市政务服务和数据管理局 = Site( +private val 北京市政务服务和数据管理局 = Site( code = "cee7f242-668b-41fb-adbc-96fb27d4bf35", name = "北京市政务服务和数据管理局", author = "lanyuanxiaoyao", home = "https://zwfwj.beijing.gov.cn", icon = "https://zwfwj.beijing.gov.cn/favicon.ico", - target = Site.Target.SEARCH, parser = Parser.Type.JSON, search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20", rules = mapOf( @@ -420,18 +275,7 @@ val 北京市政务服务和数据管理局 = Site( ) ) ), - // language=regexp - "https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule( - parser = Parser.Type.CSS, - downloader = Downloader.Type.BROWSER, - text = Content( - expression = "#main .details_page", - title = Selector("h1"), - author = Selector(".article-info .ly", process = Process(remove = listOf("来源:"))), - dateTime = Selector(".article-info span:contains(时间)", process = Process(remove = listOf("时间:"))), - content = Selector("#div_zhengwen") - ) - ) + sourceRule, ) ) diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/OverviewController.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/OverviewController.kt index e2cc0b8..d9751fa 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/OverviewController.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/OverviewController.kt @@ -2,6 +2,7 @@ package com.lanyuanxiaoyao.digtal.market.controller import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.digtal.market.sites +import com.lanyuanxiaoyao.squirrel.core.common.Site import jakarta.annotation.Resource import org.slf4j.LoggerFactory import org.springframework.data.domain.PageRequest @@ -29,8 +30,10 @@ class OverviewController { val result = articleRepository.findAll(request) return mapOf( "items" to result.content.map { - val iframe = (sites.firstOrNull { site -> site.code == it.code }?.properties?.get("iframe")?: "false").toBoolean() + val site = sites.find { site -> site.code == it.code }!! + val iframe = (site.options?.contains(Site.Option.OPEN_WITH_IFRAME) ?: false) mapOf( + "name" to site.name, "code" to it.code, "title" to it.title, "subtitle" to it.subtitle, @@ -40,6 +43,7 @@ class OverviewController { "createTime" to it.createTime, "iframe" to iframe, "category" to it.category, + "tags" to it.tags, ) }, "total" to result.totalElements, diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/SiteController.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/SiteController.kt index c67b3ca..319ea30 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/SiteController.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/SiteController.kt @@ -1,7 +1,5 @@ package com.lanyuanxiaoyao.digtal.market.controller -import cn.hutool.json.JSON -import cn.hutool.json.JSONUtil import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement import jakarta.annotation.Resource @@ -24,7 +22,7 @@ class SiteController { private lateinit var management: JvmManagement @GetMapping("list") - fun list(): List> { + fun list(): List> { val countMap = articleRepository.countGroupByCode().associate { it.key to it.count } return management.exportSites() .map { site -> @@ -34,7 +32,7 @@ class SiteController { "icon" to site.icon, "url" to site.home, "description" to site.description, - "iframe" to site.properties.containsKey("iframe"), + "iframe" to (site.properties?.containsKey("iframe") ?: false), "news" to (countMap[site.code] ?: 0) ) } diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt index 3781e43..cd133a9 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt @@ -1,25 +1,19 @@ package com.lanyuanxiaoyao.digtal.market.runner -import cn.hutool.core.date.DateUtil import cn.hutool.core.util.NumberUtil -import cn.hutool.crypto.SecureUtil -import com.lanyuanxiaoyao.digtal.market.Article import com.lanyuanxiaoyao.digtal.market.ArticleRepository import com.lanyuanxiaoyao.digtal.market.keywords +import com.lanyuanxiaoyao.digtal.market.parseArticle +import com.lanyuanxiaoyao.digtal.market.parseArticleLink import com.lanyuanxiaoyao.digtal.market.service.DescriptionService import com.lanyuanxiaoyao.digtal.market.sites import com.lanyuanxiaoyao.squirrel.core.common.Management import com.lanyuanxiaoyao.squirrel.core.common.PageParseException -import dev.failsafe.Failsafe -import dev.failsafe.FailsafeException import dev.failsafe.RetryPolicy -import dev.failsafe.function.CheckedSupplier import jakarta.annotation.Resource import java.util.concurrent.atomic.AtomicLong import kotlin.time.Duration.Companion.seconds import kotlin.time.toJavaDuration -import kotlinx.coroutines.runBlocking -import org.jsoup.Jsoup import org.slf4j.LoggerFactory import org.springframework.scheduling.annotation.Scheduled import org.springframework.stereotype.Service @@ -54,12 +48,18 @@ class NewsRunner : Runner { .forEach { site -> logger.info("站点: {}", site.name) keywords.forEach { keyword -> - val url = site.search.replace("{query}", keyword) - logger.info("类目: {}, 地址: {}", keyword, url) - val hashList = articleRepository.findAllId() - val links = parseArticleLink(site.code, url, false) + val url = site.search?.replace("{query}", keyword) + logger.info("正在搜索: {}, 地址: {}", keyword, url) + val links = + try { + parseArticleLink(management, site.code, url, false) + } catch (e: Exception) { + logger.error("解析失败 $url", e) + emptyList() + } val total = links.size val current = AtomicLong(0) + val hashList = articleRepository.findAllId() links .filter { if (hashList.contains(it.hash)) { @@ -68,128 +68,36 @@ class NewsRunner : Runner { } else true } .forEach { link -> - Thread.sleep(500) + Thread.sleep(1000) logger.info("进度:{} 采集文章:{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url) - parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article -> - if (!article.text.isNullOrBlank()) { - val triple = descriptionService.parseDescription(article.text) - logger.info("小标题:{}", triple?.first) - logger.info("描述:{}", triple?.second) - logger.info("相关度:{}", triple?.third) - article.subtitle = triple?.first - article.description = triple?.second - article.score = triple?.third - } + try { + parseArticle(management, link) + ?.let { article -> + if (!article.text.isNullOrBlank() || !article.html.isNullOrBlank()) { + val content = if (article.text.isNullOrBlank()) article.html else article.text + val triple = descriptionService.parseDescription(content) + logger.info("标题:{}", triple?.title) + logger.info("作者:{}", triple?.author) + logger.info("副标题:{}", triple?.subtitle) + logger.info("描述:{}", triple?.description) + logger.info("标签:{}", triple?.tags) - article.category = keyword - articleRepository.save(article) + article.title = if (link.title.isNullOrBlank()) triple?.title else link.title + article.author = if (article.author.isNullOrBlank()) triple?.author else article.author + article.subtitle = triple?.subtitle + article.description = triple?.description + article.tags = triple?.tags + } + + article.category = keyword + articleRepository.save(article) + } + } catch (e: Exception) { + logger.error("解析失败 $link", e) } } } } logger.info("本轮采集完成") } - - fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? { - logger.info("Title: {} Datetime: {}", title, datetime) - val page = try { - Failsafe - .with(retryPolicy) - .get(CheckedSupplier { - runBlocking { management.parse(code, url) } - }) - } catch (e: FailsafeException) { - logger.error("Parse failure", e) - return null - } - (page["text"] as? Map<*, *>)?.let { text -> - var datetimeText = text["datetime"] as? String - val createTime = try { - if (datetimeText.isNullOrBlank()) { - datetimeText = datetime - } - if (datetimeText?.matches(Regex("\\d{10}")) == true) { - DateUtil - .date(datetimeText.toLong()) - .toTimestamp() - } else if (datetimeText?.matches(Regex("\\d{7}")) == true) { - DateUtil - .date(datetimeText.toLong() * 1000) - .toTimestamp() - } else { - DateUtil - .parse( - datetimeText, - "yyyy-MM-dd HH:mm:ss", - "yyyy-MM-dd", - "yyyy年MM月dd日 HH:mm", - "yyyy-MM-dd HH:mm", - ) - .toTimestamp() - } - } catch (e: Exception) { - logger.error("Parse $datetimeText error", e) - null - } - val source = text["source"]?.let { - val document = Jsoup.parse((it as String)) - document - .select("script") - .forEach { node -> node.remove() } - document.forEachNode { node -> node.removeAttr("style") } - document - .body() - .html() - } - return Article( - id = hash, - code = code, - url = url, - title = text["title"] as String?, - author = text["author"] as String?, - category = null, - text = if ((text["content"] as String?) == null) null else text["content"] as String, - html = source, - subtitle = null, - description = null, - score = null, - createTime = createTime, - pushed = false, - ) - } ?: return null - } - - data class ArticleLink(val url: String, val hash: String, val title: String?, val datetime: String?) - - fun parseArticleLink(code: String, url: String, recursive: Boolean = false): List { - var next: String? = url - val links = mutableListOf() - do { - logger.info("解析目录:{}", next) - val page = try { - Failsafe - .with(retryPolicy) - .get(CheckedSupplier { - runBlocking { management.parse(code, next!!) } - }) - } catch (e: FailsafeException) { - logger.error("Parse failure", e) - continue - } - @Suppress("UNCHECKED_CAST") (page["list"] as? List>)?.let { list -> - for (item in list) { - val title = item["title"] as String? - val datetime = item["datetime"] as String? - val link = item["link"] as String? - if (link.isNullOrBlank()) { - logger.warn("链接为空:{} {}", title, link) - } else { - links.add(ArticleLink(link, SecureUtil.md5(link), title, datetime)) - } - } - } - next = page["next"] as String? - } while (recursive && !next.isNullOrBlank()) - return links - } } diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/service/DescriptionService.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/service/DescriptionService.kt index a441e50..e0c5601 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/service/DescriptionService.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/service/DescriptionService.kt @@ -12,6 +12,14 @@ import kotlin.time.toJavaDuration import org.slf4j.LoggerFactory import org.springframework.stereotype.Service +data class Description( + val title: String, + val author: String, + val subtitle: String, + val description: String, + val tags: String, +) + @Service class DescriptionService { private val logger = LoggerFactory.getLogger(javaClass) @@ -26,7 +34,7 @@ class DescriptionService { QianfanChat(), ) - fun parseDescription(content: String?): Triple? { + fun parseDescription(content: String?): Description? { return content?.let { if (it.isNotBlank()) { var description: String? @@ -50,11 +58,16 @@ class DescriptionService { .replace("```json", "") .replace("```", "") ) - val subtitle = root.getByPath("title", String::class.java) + val title = root.getByPath("title", String::class.java) + val author = root.getByPath("author", String::class.java) + val subtitle = root.getByPath("subtitle", String::class.java) val desc = root.getByPath("description", String::class.java) - val score = root.getByPath("score", Int::class.java) - return@let Triple(subtitle, desc, score) + val tags = root + .getByPath("tags", String::class.java) + .replace(",", ",") + return@let Description(title, author, subtitle, desc, tags) } catch (e: Throwable) { + logger.error("json解析失败", e) continue } } diff --git a/src/main/resources/static/component/overview-tab.js b/src/main/resources/static/component/overview-tab.js index 1463fcd..02406ee 100644 --- a/src/main/resources/static/component/overview-tab.js +++ b/src/main/resources/static/component/overview-tab.js @@ -5,7 +5,7 @@ function pagination() { model: 'normal', maxButtons: 10, showPageInput: false, - perPageAvailable: [10, 15, 20], + perPageAvailable: [10, 15, 20, 50, 100], activePage: '${page|default:1}', total: '${total|default:0}', className: 'text-right', @@ -131,6 +131,11 @@ function overviewTab() { className: 'text-current', tpl: '${description}', }, + { + type: 'tpl', + className: 'text-blue-900 text-sm mt-2', + tpl: '${name}', + }, { type: 'wrapper', size: 'none', @@ -152,10 +157,21 @@ function overviewTab() { type: 'tag', label: '${author}', displayMode: 'rounded', - color: '#ff8888', + color: '#bd6464', }, ] }, + { + type: 'each', + className: 'mt-2', + source: "${SPLIT(tags, ',')}", + items: { + type: 'tag', + label: '${item}', + displayMode: 'rounded', + color: '#6b3481', + } + } ] }, }, diff --git a/src/main/resources/static/index.html b/src/main/resources/static/index.html index 1a536cd..c2bfbaf 100644 --- a/src/main/resources/static/index.html +++ b/src/main/resources/static/index.html @@ -48,7 +48,7 @@ ] } } - let debug = true + let debug = false let server = amis.embed( '#root', amisJSON, diff --git a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/Test.kt b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/Test.kt index 975626b..08bfc45 100644 --- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/Test.kt +++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/Test.kt @@ -1,6 +1,7 @@ package com.lanyuanxiaoyao.digtal.market import cn.hutool.core.date.DateUtil +import cn.hutool.core.io.FileUtil import com.lanyuanxiaoyao.digtal.market.ai.Chat import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat @@ -144,4 +145,18 @@ class Test { ) ) } + + @Test + fun testNativeDownload() { + val downloader = BasicDownloaderFactory().build(emptyMap()) + val page = runBlocking { + downloader.download("http://zfsg.gd.gov.cn/xxfb/dtxw/content/post_4515949.html") + } + val root = Jsoup.parse(page).body() + root.select("script").forEach { it.remove() } + root.select("style").forEach { it.remove() } + root.select("link").forEach { it.remove() } + root.allElements.forEach { it.removeAttr("style") } + FileUtil.writeString(root.html(), "/Users/lanyuanxiaoyao/Project/IdeaProjects/digtal-market/source.txt", Charsets.UTF_8) + } } diff --git a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt index 978d6b3..29f9b37 100644 --- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt +++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt @@ -24,18 +24,19 @@ class TestManagement { @Test fun testParse() { - newsRunner - .parseArticleLink( - "9a7f1d8f-4f39-4120-adeb-7435339b97bb", - "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22数据要素%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D", - ) + parseArticleLink( + management, + "9a7f1d8f-4f39-4120-adeb-7435339b97bb", + "https://www.gov.cn/zhengce/content/202409/content_6977766.htm", + ) .forEach { link -> - val article = newsRunner.parseArticle( + val article = parseArticle( + management, "9a7f1d8f-4f39-4120-adeb-7435339b97bb", link.url, + "", link.title, link.datetime, - "", ) logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text) } @@ -54,26 +55,26 @@ class TestManagement { @Test fun testParseList() { - newsRunner - .parseArticleLink( - "cee7f242-668b-41fb-adbc-96fb27d4bf35", - "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20", - true, - ) + parseArticleLink( + management, + "cee7f242-668b-41fb-adbc-96fb27d4bf35", + "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20", + true, + ) // .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) } .let { it.forEach { logger.info("{}", it.url) } } } @Test fun testParseArticle() { - newsRunner - .parseArticle( - "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be", - "https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm", - "工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知", - "2024-11-01 12:48:26", - "", - ) + parseArticle( + management, + "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be", + "https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm", + "", + "工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知", + "2024-11-01 12:48:26", + ) ?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) } } } \ No newline at end of file diff --git a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestRule.kt b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestRule.kt index f58c8c0..70d9ae2 100644 --- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestRule.kt +++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestRule.kt @@ -14,8 +14,8 @@ class TestRule { private lateinit var management: Management private val link = - "https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html" - private val site = 北京市政务服务和数据管理局 + "https://www.gov.cn/zhengce/202410/content_6979047.htm" + private val site = sites.find { it.code == "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be" }!! @BeforeTest fun before() { diff --git a/src/test/resources/test.http b/src/test/resources/test.http index 36685f8..bc7fed4 100644 --- a/src/test/resources/test.http +++ b/src/test/resources/test.http @@ -70,4 +70,17 @@ Content-Type: application/json } ### Search -POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20 \ No newline at end of file +POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20 + +### Search +POST https://www.cantonde.com/si/common/searchInfo +Content-Type: application/json + +{ + "NAME": "数据要素", + "IN_CATEGORY": "", + "NOT_IN_CATEGORY": "", + "CATEGORY": "", + "pageNo": 1, + "pageSize": 10 +}