feat: 适配squirrel改造

增加AI解析页面、优化解析流程
2024-11-06 17:27:05 +08:00
parent 3bf399f7aa
commit 14e84a0d4c
13 changed files with 319 additions and 354 deletions
--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Entity.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Entity.kt
@@ -24,7 +24,7 @@ class Article(
    @Column(columnDefinition = "longtext") var html: String?,
    var subtitle: String?,
    @Column(columnDefinition = "longtext") var description: String?,
-    var score: Int?,
+    var tags: String?,
    var createTime: Date?,
    var pushed: Boolean?,
 )
@@ -41,7 +41,7 @@ interface ArticleRepository : JpaRepository<Article, String>, JpaSpecificationEx
    @Query("update Article article set article.pushed = :pushed where article.id = :id")
    fun updatePushedById(@Param("id") id: String, @Param("pushed") pushed: Boolean)

-    @Query("select article.id from Article article where article.description is not null and article.subtitle is not null and article.score is not null")
+    @Query("select article.id from Article article where article.description is not null and article.text is not null and article.text <> ''")
    fun findAllId(): List<String>

    @Query("select new com.lanyuanxiaoyao.digtal.market.CountGroupByString(article.code, count(article.code)) from Article article group by article.code")
--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Helper.kt
@@ -0,0 +1,153 @@
+package com.lanyuanxiaoyao.digtal.market
+
+import cn.hutool.core.date.DateUtil
+import cn.hutool.crypto.SecureUtil
+import com.lanyuanxiaoyao.squirrel.core.common.Management
+import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
+import dev.failsafe.Failsafe
+import dev.failsafe.FailsafeException
+import dev.failsafe.RetryPolicy
+import dev.failsafe.function.CheckedSupplier
+import kotlin.time.Duration.Companion.seconds
+import kotlin.time.toJavaDuration
+import kotlinx.coroutines.runBlocking
+import org.jsoup.Jsoup
+import org.slf4j.LoggerFactory
+
+data class ArticleLink( // one entry taken from a parsed list page, pointing at a single article
+    val code: String, // site code; parseArticle hands it to Management.parse together with url
+    val url: String, // value of the list item's "link" field
+    val hash: String, // MD5 of url (see parseArticleLink); parseArticle stores it as the Article id
+    val title: String?, // list item's "title" field, if it was a string
+    val datetime: String?, // list item's "datetime" field; parseArticle uses it as a fallback date
+)
+
+private val logger = LoggerFactory.getLogger("Helper") // logger shared by the helpers in this file
+
+private val retryPolicy = RetryPolicy // retry settings applied to every Management.parse call in this file
+    .builder<Any>()
+    .withDelay(10.seconds.toJavaDuration()) // wait 10 seconds between attempts
+    .withMaxRetries(2) // up to 2 retries after the first attempt
+    .handleIf { e -> (e is PageParseException).not() } // PageParseException is not retried; other failures are
+    .build()
+
+/**
+ * Collects article links for the site [code], starting from the list page at [url].
+ *
+ * Each page is fetched through [Management.parse] under the file's retry policy. Items whose "link"
+ * is missing or blank are skipped with a warning. If a page still fails after the retries, the error
+ * is logged and the links gathered so far are returned (the old `continue` re-tested the loop
+ * condition with an unchanged URL, so a failing page was retried forever when [recursive] was true).
+ * @param url first list page; a null or blank value yields an empty list instead of an NPE
+ * @param recursive when true, keep following each page's "next" link until none is left
+ * @return the links in page order
+ */
+@Suppress("UNCHECKED_CAST")
+fun parseArticleLink(management: Management, code: String, url: String?, recursive: Boolean = false): List<ArticleLink> {
+    val links = mutableListOf<ArticleLink>()
+    var next: String? = url
+    while (!next.isNullOrBlank()) {
+        val current = next
+        val page = try {
+            Failsafe
+                .with(retryPolicy)
+                .get(CheckedSupplier { runBlocking { management.parse(code, current) } })
+        } catch (e: FailsafeException) {
+            // Stop instead of `continue`: `next` has not advanced, so looping would refetch this page.
+            logger.error("解析失败：${current}", e)
+            break
+        }
+        for (item in (page["list"] as? List<Map<String, Any>>).orEmpty()) {
+            val link = item["link"] as? String
+            if (link.isNullOrBlank()) {
+                logger.warn("链接为空：{} {}", item["title"] as? String, current)
+                continue
+            }
+            links.add(ArticleLink(code, link, SecureUtil.md5(link), item["title"] as? String, item["datetime"] as? String))
+        }
+        next = if (recursive) (page["next"] as? String) else null
+    }
+    return links
+}
+
+/** Unpacks [articleLink] and delegates to the field-wise [parseArticle] overload. */
+fun parseArticle(management: Management, articleLink: ArticleLink): Article? = with(articleLink) {
+    parseArticle(management, code, url, hash, title, datetime)
+}
+
+private val epochMillisRegex = Regex("\\d{13}")
+private val epochSecondsRegex = Regex("\\d{10}")
+
+/**
+ * Converts a scraped date string into a timestamp; returns null when it is absent or cannot be parsed.
+ *
+ * Accepts 13 digits (epoch milliseconds), 10 digits (epoch seconds) or one of the date patterns below.
+ * Null/blank input returns null up front; it used to be passed to the parser and only surfaced as a
+ * caught exception plus an error log.
+ */
+private fun parseArticleCreateTime(datetimeText: String?): java.sql.Timestamp? {
+    if (datetimeText.isNullOrBlank()) return null
+    return try {
+        val parsed = when {
+            datetimeText.matches(epochMillisRegex) -> DateUtil.date(datetimeText.toLong())
+            datetimeText.matches(epochSecondsRegex) -> DateUtil.date(datetimeText.toLong() * 1000)
+            else -> DateUtil.parse(datetimeText, "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd", "yyyy年MM月dd日 HH:mm", "yyyy-MM-dd HH:mm")
+        }
+        parsed.toTimestamp()
+    } catch (e: Exception) {
+        logger.error("解析 $datetimeText 失败", e)
+        null
+    }
+}
+
+/** Removes `script`/`link` elements and all `style` attributes from [html]; returns the body's inner HTML. */
+private fun stripScriptsAndStyles(html: String): String {
+    val document = Jsoup.parse(html)
+    document.select("script, link").remove()
+    document.select("[style]").removeAttr("style")
+    return document.body().html()
+}
+
+/**
+ * Fetches the article page at [url] for site [code] and maps its "text" section to an [Article].
+ *
+ * The page is loaded through [Management.parse] under the file's retry policy. Returns null when that
+ * call ends in a [FailsafeException] (logged) or when the page has no map under "text".
+ * A blank "source" is stored as-is; otherwise it is cleaned with [stripScriptsAndStyles].
+ *
+ * @param hash stored as the article id
+ * @param title only logged here
+ * @param datetime fallback used when the page itself carries no "datetime"
+ * @return the parsed article with `pushed = false`, or null as described above
+ */
+fun parseArticle(management: Management, code: String, url: String, hash: String, title: String?, datetime: String?): Article? {
+    logger.info("标题：{} 时间：{}", title, datetime)
+    val page = try {
+        Failsafe.with(retryPolicy).get(CheckedSupplier { runBlocking { management.parse(code, url) } })
+    } catch (e: FailsafeException) {
+        logger.error("解析失败：${url}", e)
+        return null
+    }
+    val fields = (page["text"] as? Map<*, *>) ?: return null
+    // The date found on the article page wins; the list-page value is only a fallback.
+    val datetimeText = (fields["datetime"] as? String)?.takeUnless { it.isBlank() } ?: datetime
+    val createTime = parseArticleCreateTime(datetimeText)
+    val sourceText = (fields["source"] as? String) ?: ""
+    val cleanedHtml = if (sourceText.isBlank()) sourceText else stripScriptsAndStyles(sourceText)
+    return Article(
+        id = hash,
+        code = code,
+        url = url,
+        // NOTE(review): the null fields below are presumably filled in later by the caller — confirm.
+        title = null,
+        author = null,
+        category = null,
+        text = (fields["content"] as? String) ?: "",
+        html = cleanedHtml,
+        subtitle = null,
+        description = null,
+        tags = null,
+        createTime = createTime,
+        pushed = false,
+    )
+}
--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt
@@ -11,11 +11,11 @@ import com.lanyuanxiaoyao.squirrel.core.common.Rule
 import com.lanyuanxiaoyao.squirrel.core.common.Script
 import com.lanyuanxiaoyao.squirrel.core.common.Selector
 import com.lanyuanxiaoyao.squirrel.core.common.Site
+import com.lanyuanxiaoyao.squirrel.core.common.Site.Option.Companion.OPEN_WITH_IFRAME

-private val html = Pair("html", "true")
-private val iframe = Pair("iframe", "true")
-private val post = Pair("post", "true")
-private val form = Pair("form", "true")
+private val html = "html" to "true"
+private val post = "post" to "true"
+private val form = "form" to "true"

 // language=regexp
 private val commonRemove = listOf(
@@ -23,19 +23,29 @@ private val commonRemove = listOf(
    "&.+?;"
 )

+private val sourceRule = // generic article rule reused as the trailing entry of the site rule maps below
+    // language=regexp
+    "https*://.+" to Rule( // pattern matches http:// and https:// URLs
+        downloader = Downloader.Type.BROWSER,
+        parser = Parser.Type.CSS,
+        text = Content(
+            expression = "body",
+            content = Selector(":root"),
+            extra = mapOf(
+                "source" to Selector(":root", properties = mapOf(html)) // same selector with the html=true property
+            ),
+        )
+    )
+
 private val 广东政务服务和数据管理局 = Site(
    code = "74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
    name = "广东政务服务和数据管理局",
    home = "https://zfsg.gd.gov.cn",
-    parser = Parser.Type.CSS,
    author = "lanyuanxiaoyao",
-    target = Site.Target.SEARCH,
-    downloader = Downloader.Type.BROWSER,
    search = "https://search.gd.gov.cn/api/search/all?json=%7B%22page%22%3A1%2C%22keywords%22%3A%22{query}%22%2C%22advance%22%3A%22true%22%2C%22sort%22%3A%22time%22%2C%22position%22%3A%22all%22%2C%22time_to%22%3A2524579200%2C%22time_from%22%3A189273600%2C%22site_id%22%3A%22246%22%2C%22range%22%3A%22site%22%2C%22recommand%22%3A1%7D",
    rules = mapOf(
        // language=regexp
        "https://search.gd.gov.cn/api/search/all.*" to Rule(
-            downloader = Downloader.Type.HTTP,
            parser = Parser.Type.JSON,
            list = Content(
                expression = "$.data.news.list",
@@ -57,71 +67,7 @@ private val 广东政务服务和数据管理局 = Site(
                )
            )
        ),
-        // language=regexp
-        "https*://zfsg\\.gd\\.gov\\.cn/gkmlpt/content/.*/post_\\d+\\.html.*" to Rule(
-            downloader = Downloader.Type.BROWSER,
-            text = Content(
-                expression = ".content-container",
-                title = Selector(".content-box .content h1.title"),
-                author = Selector("td.first:contains(发布机构) + td > span"),
-                dateTime = Selector("td.second:contains(成文日期) + td > span"),
-                content = Selector(".content .article-content"),
-                extra = mapOf(
-                    "source" to Selector(".content .article-content", properties = mapOf(html))
-                ),
-            )
-        ),
-        // language=regexp
-        "https*://zfsg\\.gd\\.gov\\.cn/(xxfb|zwgk|ztzl)/.*content/post_\\d+\\.html" to Rule(
-            text = Content(
-                expression = ".Con",
-                title = Selector("h3.zw-title"),
-                author = Selector(".zw-info > .ly", process = Process(remove = listOf("来源\\s*:\\s*"))),
-                dateTime = Selector(".zw-info > .time", process = Process(remove = listOf("时间\\s*:\\s*"))),
-                content = Selector(".zw"),
-                extra = mapOf(
-                    "source" to Selector(".zw", properties = mapOf(html))
-                )
-            )
-        ),
-        // language=regexp
-        "https://mp\\.weixin\\.qq\\.com/s/.+" to Rule(
-            text = Content(
-                expression = "#page-content",
-                title = Selector("#activity-name"),
-                author = Selector("#js_name"),
-                dateTime = Selector("#publish_time"),
-                content = Selector("#js_content"),
-                extra = mapOf(
-                    "source" to Selector("#js_content", properties = mapOf(html))
-                )
-            )
-        ),
-        // language=regexp
-        "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/yjzj/answer/.+" to Rule(
-            text = Content(
-                expression = "#article-container",
-                title = Selector("#article-title"),
-                author = Selector("#article-source", process = Process(remove = listOf("发布机构："))),
-                dateTime = Selector("#article-published_at", process = Process(remove = listOf("发布时间："))),
-                content = Selector("#article-content"),
-                extra = mapOf(
-                    "source" to Selector("#article-content", properties = mapOf(html))
-                )
-            )
-        ),
-        // language=regexp
-        "https*://zfsg\\.gd\\.gov\\.cn/hdjlpt/live/index.php\\?pid=\\d+" to Rule(
-            text = Content(
-                expression = "body",
-                title = Selector(".title-page .txt > span"),
-                author = Selector(process = Process(default = "广东政务服务和数据管理局")),
-                content = Selector(".content"),
-                extra = mapOf(
-                    "source" to Selector(".content", properties = mapOf(html))
-                )
-            )
-        ),
+        sourceRule,
    )
 )

@@ -130,11 +76,8 @@ private val 深圳市政务服务和数据管理局 = Site(
    name = "深圳市政务服务和数据管理局",
    home = "https://www.sz.gov.cn/szzsj/gkmlpt/index",
    icon = "https://www.sz.gov.cn/favicon.ico",
-    parser = Parser.Type.CSS,
    author = "lanyuanxiaoyao",
-    target = Site.Target.TEXT,
-    downloader = Downloader.Type.HTTP,
-    properties = mapOf(iframe),
+    options = listOf(OPEN_WITH_IFRAME),
    search = "https://search.gd.gov.cn/jsonp/site/755576?callback=getResult&page=1&pagesize=20&text={query}&order=1&position=all",
    rules = mapOf(
        // language=regexp
@@ -179,20 +122,7 @@ private val 深圳市政务服务和数据管理局 = Site(
                )
            )
        ),
-        // language=regexp
-        "https*://www\\.sz\\.gov\\.cn/szzsj/gkmlpt/content/\\d+/\\d+/post_\\d+\\.html" to Rule(
-            downloader = Downloader.Type.BROWSER,
-            text = Content(
-                expression = ".content-container",
-                title = Selector(".content-box .content h1.title"),
-                author = Selector("td.first:contains(发布机构) + td > span"),
-                dateTime = Selector("td.second:contains(成文日期) + td > span"),
-                content = Selector(".content .article-content"),
-                extra = mapOf(
-                    "source" to Selector(".content .article-content", properties = mapOf(html))
-                ),
-            )
-        )
+        sourceRule,
    )
 )

@@ -201,12 +131,9 @@ private val 中华人民共和国中央人民政府 = Site(
    name = "中华人民共和国中央人民政府",
    home = "https://www.gov.cn",
    icon = "https://www.gov.cn/favicon.ico",
-    parser = Parser.Type.CSS,
    author = "lanyuanxiaoyao",
-    target = Site.Target.TEXT,
-    downloader = Downloader.Type.HTTP,
-    properties = mapOf(iframe),
    search = "https://sousuoht.www.gov.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE?json=%7B%22code%22%3A%2217da70961a7%22%2C%22historySearchWords%22%3A%5B%5D%2C%22dataTypeId%22%3A%22107%22%2C%22orderBy%22%3A%22time%22%2C%22searchBy%22%3A%22title%22%2C%22appendixType%22%3A%22%22%2C%22granularity%22%3A%22ALL%22%2C%22trackTotalHits%22%3Atrue%2C%22beginDateTime%22%3A%22%22%2C%22endDateTime%22%3A%22%22%2C%22isSearchForced%22%3A0%2C%22filters%22%3A%5B%5D%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%2C%22customFilter%22%3A%7B%22operator%22%3A%22and%22%2C%22properties%22%3A%5B%5D%7D%2C%22searchWord%22%3A%22{query}%22%7D",
+    options = listOf(OPEN_WITH_IFRAME),
    rules = mapOf(
        // language=regexp
        "https://sousuoht\\.www\\.gov\\.cn/athena/forward/2B22E8E39E850E17F95A016A74FCB6B673336FA8B6FEC0E2955907EF9AEE06BE.+" to Rule(
@@ -216,10 +143,10 @@ private val 中华人民共和国中央人民政府 = Site(
                "Athenaappkey" to "be5qL61qyHKQ87BVlqy%2BwloIkpcL1tCqC4yOM%2F%2BsqaSn0NxuA4GPOWocQu96h0PUaYBe9hz3DVvjGTECMNNou6cjAlUc3UoJlviQAipc4Ha8qxmIeS1sPqdGQ3Unm49j%2BsYN5T%2BzjqKxdA2yz5qLcG5wjZR7rqYVgT98NasByMA%3D",
                "Athenaappname" to "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
            ),
-            properties = mapOf("post" to "true"),
+            properties = mapOf(post),
            list = Content(
                expression = "$.result.data.middle.list",
-                title = Selector("$.title_no_tag"),
+                title = Selector("$.title_no_tag", process = Process(remove = commonRemove)),
                dateTime = Selector("$.time"),
                link = Selector("$.url"),
            ),
@@ -236,51 +163,7 @@ private val 中华人民共和国中央人民政府 = Site(
                )
            )
        ),
-        // language=regexp
-        "https*://www\\.gov\\.cn/.+/(zhengceku)/.+/content_\\d+\\.html*" to Rule(
-            downloader = Downloader.Type.BROWSER,
-            text = Content(
-                expression = ".policyLibraryOverview_content",
-                author = Selector("td:contains(源：) + td"),
-                content = Selector(".pages_content"),
-                extra = mapOf(
-                    "source" to Selector(".pages_content", properties = mapOf("html" to "true"))
-                )
-            )
-        ),
-        // language=regexp
-        "https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
-            downloader = Downloader.Type.BROWSER,
-            text = Content(
-                expression = ".content .article",
-                title = Selector("h1#ti"),
-                author = Selector(
-                    ".pages-date > .font",
-                    process = Process(
-                        default = "中华人民共和国中央人民政府",
-                        remove = listOf("来源：")
-                    )
-                ),
-                dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")),
-                content = Selector(".pages_content"),
-                extra = mapOf(
-                    "source" to Selector(".pages_content", properties = mapOf("html" to "true"))
-                )
-            )
-        ),
-        // language=regexp
-        "https://xcx\\.www\\.gov\\.cn/robot/gwypolicy/#/qaReadDetails.+" to Rule(
-            downloader = Downloader.Type.BROWSER,
-            text = Content(
-                expression = ".main-content",
-                title = Selector(".qa_content_box"),
-                author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源："))),
-                content = Selector(".qa_content_text"),
-                extra = mapOf(
-                    "source" to Selector(".main-content", properties = mapOf("html" to "true"))
-                )
-            )
-        ),
+        sourceRule,
    )
 )

@@ -289,12 +172,10 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
    name = "中华人民共和国国家互联网信息办公室",
    home = "https://www.cac.gov.cn",
    icon = "https://www.cac.gov.cn/favicon.ico",
-    parser = Parser.Type.CSS,
    author = "lanyuanxiaoyao",
-    target = Site.Target.TEXT,
-    downloader = Downloader.Type.HTTP,
    search = "https://search.cac.gov.cn/cms/cmsadmin/infopub/gjjs.jsp?templetid=1563339473064626&pubtype=S&pubpath=portal&page=1&webappcode=A09&huopro={query}&mustpro=&notpro=&inpro=&startDate=\$\$\$&endDate=\$\$\$&sort=1&searchfield=\$\$\$&searchdir=A09",
    rules = mapOf(
+        // language=regexp
        "https://search\\.cac\\.gov\\.cn/cms/cmsadmin/infopub/gjjs\\.jsp\\?templetid=1563339473064626&pubtype=S&pubpath=portal&page=\\d*&webappcode=A09&huopro=.+&mustpro=&notpro=&inpro=&startDate=\\\$*&endDate=\\\$*&sort=1&searchfield=\\\$*&searchdir=A09" to Rule(
            list = Content(
                expression = ".xpage-container .list-item",
@@ -302,24 +183,9 @@ private val 中华人民共和国国家互联网信息办公室 = Site(
                dateTime = Selector(".search_time"),
                link = Selector("a", "href", process = Process(prefix = "https:"))
            ),
-            next = Selector(
-                ".xpage-pagination .xpage-pagination-next a:contains(下一页)",
-                "href",
-                Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/")
-            )
+            next = Selector(".xpage-pagination .xpage-pagination-next a:contains(下一页)", "href", Process(prefix = "https://search.cac.gov.cn/cms/cmsadmin/infopub/"))
        ),
-        "https://www\\.cac\\.gov\\.cn/.+/c_\\d+\\.htm" to Rule(
-            text = Content(
-                expression = ".main",
-                title = Selector("h1.title"),
-                author = Selector("#source", process = Process(remove = listOf("来源："))),
-                dateTime = Selector("#pubtime"),
-                content = Selector(".main-content"),
-                extra = mapOf(
-                    "source" to Selector(".main", properties = mapOf("html" to "true"))
-                )
-            )
-        )
+        sourceRule,
    )
 )

@@ -330,7 +196,6 @@ val 广州数据交易所 = Site(
    home = "https://www.cantonde.com",
    icon = "https://www.cantonde.com/favicon.ico",
    description = "广州数据交易所是广东省深入贯彻落实党中央、国务院关于加快培育数据要素市场，助力数字经济高质量发展工作部署，高标准建设的新型数据交易场所。旨在为市场主体提供合规安全、集约高效的数据流通交易综合性服务。广州数据交易所作为广东省数据要素市场体系的核心枢纽，是畅通数据要素大循环的关键举措，也是推进数据要素市场化配置改革的重要载体。",
-    target = Site.Target.SEARCH,
    parser = Parser.Type.JSON,
    search = "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22{query}%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
    rules = mapOf(
@@ -346,7 +211,7 @@ val 广州数据交易所 = Site(
                            Script(
                                Script.Type.Javascript,
                                // language=javascript
-                                "return `https://www.cantonde.com/si/info/detail?json=%7B%22id%22%3A%22\${text}%22%7D`",
+                                "return `https://www.cantonde.com/info.html#/infoDetail?id=\${text}`",
                            )
                        )
                    )
@@ -365,26 +230,16 @@ val 广州数据交易所 = Site(
                )
            ),
        ),
-        // language=regexp
-        "https://www\\.cantonde\\.com/si/info/detail\\?json=.+" to Rule(
-            text = Content(
-                expression = "$.data",
-                title = Selector("$.TITLE"),
-                content = Selector("$.CONTENT", process = Process(remove = commonRemove)),
-                author = Selector(process = Process(default = "广州数据交易所"))
-            ),
-            properties = mapOf(post),
-        ),
+        sourceRule,
    ),
 )

-val 北京市政务服务和数据管理局 = Site(
+private val 北京市政务服务和数据管理局 = Site(
    code = "cee7f242-668b-41fb-adbc-96fb27d4bf35",
    name = "北京市政务服务和数据管理局",
    author = "lanyuanxiaoyao",
    home = "https://zwfwj.beijing.gov.cn",
    icon = "https://zwfwj.beijing.gov.cn/favicon.ico",
-    target = Site.Target.SEARCH,
    parser = Parser.Type.JSON,
    search = "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"{query}\")&sort=dateDesc&siteCode=1100000248&tab=all&page=1&pageSize=20",
    rules = mapOf(
@@ -420,18 +275,7 @@ val 北京市政务服务和数据管理局 = Site(
                )
            )
        ),
-        // language=regexp
-        "https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(
-            parser = Parser.Type.CSS,
-            downloader = Downloader.Type.BROWSER,
-            text = Content(
-                expression = "#main .details_page",
-                title = Selector("h1"),
-                author = Selector(".article-info .ly", process = Process(remove = listOf("来源："))),
-                dateTime = Selector(".article-info span:contains(时间)", process = Process(remove = listOf("时间："))),
-                content = Selector("#div_zhengwen")
-            )
-        )
+        sourceRule,
    )
 )

--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/OverviewController.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/OverviewController.kt
@@ -2,6 +2,7 @@ package com.lanyuanxiaoyao.digtal.market.controller

 import com.lanyuanxiaoyao.digtal.market.ArticleRepository
 import com.lanyuanxiaoyao.digtal.market.sites
+import com.lanyuanxiaoyao.squirrel.core.common.Site
 import jakarta.annotation.Resource
 import org.slf4j.LoggerFactory
 import org.springframework.data.domain.PageRequest
@@ -29,8 +30,10 @@ class OverviewController {
        val result = articleRepository.findAll(request)
        return mapOf(
            "items" to result.content.map {
-                val iframe = (sites.firstOrNull { site -> site.code == it.code }?.properties?.get("iframe")?: "false").toBoolean()
+                val site = sites.find { site -> site.code == it.code }!!
+                val iframe = (site.options?.contains(Site.Option.OPEN_WITH_IFRAME) ?: false)
                mapOf(
+                    "name" to site.name,
                    "code" to it.code,
                    "title" to it.title,
                    "subtitle" to it.subtitle,
@@ -40,6 +43,7 @@ class OverviewController {
                    "createTime" to it.createTime,
                    "iframe" to iframe,
                    "category" to it.category,
+                    "tags" to it.tags,
                )
            },
            "total" to result.totalElements,
--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/SiteController.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/controller/SiteController.kt
@@ -1,7 +1,5 @@
 package com.lanyuanxiaoyao.digtal.market.controller

-import cn.hutool.json.JSON
-import cn.hutool.json.JSONUtil
 import com.lanyuanxiaoyao.digtal.market.ArticleRepository
 import com.lanyuanxiaoyao.squirrel.core.jvm.JvmManagement
 import jakarta.annotation.Resource
@@ -24,7 +22,7 @@ class SiteController {
    private lateinit var management: JvmManagement

    @GetMapping("list")
-    fun list(): List<Map<String, Any>> {
+    fun list(): List<Map<String, Any?>> {
        val countMap = articleRepository.countGroupByCode().associate { it.key to it.count }
        return management.exportSites()
            .map { site ->
@@ -34,7 +32,7 @@ class SiteController {
                    "icon" to site.icon,
                    "url" to site.home,
                    "description" to site.description,
-                    "iframe" to site.properties.containsKey("iframe"),
+                    "iframe" to (site.properties?.containsKey("iframe") ?: false),
                    "news" to (countMap[site.code] ?: 0)
                )
            }
--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt
@@ -1,25 +1,19 @@
 package com.lanyuanxiaoyao.digtal.market.runner

-import cn.hutool.core.date.DateUtil
 import cn.hutool.core.util.NumberUtil
-import cn.hutool.crypto.SecureUtil
-import com.lanyuanxiaoyao.digtal.market.Article
 import com.lanyuanxiaoyao.digtal.market.ArticleRepository
 import com.lanyuanxiaoyao.digtal.market.keywords
+import com.lanyuanxiaoyao.digtal.market.parseArticle
+import com.lanyuanxiaoyao.digtal.market.parseArticleLink
 import com.lanyuanxiaoyao.digtal.market.service.DescriptionService
 import com.lanyuanxiaoyao.digtal.market.sites
 import com.lanyuanxiaoyao.squirrel.core.common.Management
 import com.lanyuanxiaoyao.squirrel.core.common.PageParseException
-import dev.failsafe.Failsafe
-import dev.failsafe.FailsafeException
 import dev.failsafe.RetryPolicy
-import dev.failsafe.function.CheckedSupplier
 import jakarta.annotation.Resource
 import java.util.concurrent.atomic.AtomicLong
 import kotlin.time.Duration.Companion.seconds
 import kotlin.time.toJavaDuration
-import kotlinx.coroutines.runBlocking
-import org.jsoup.Jsoup
 import org.slf4j.LoggerFactory
 import org.springframework.scheduling.annotation.Scheduled
 import org.springframework.stereotype.Service
@@ -54,12 +48,18 @@ class NewsRunner : Runner {
            .forEach { site ->
                logger.info("站点: {}", site.name)
                keywords.forEach { keyword ->
-                    val url = site.search.replace("{query}", keyword)
-                    logger.info("类目: {}, 地址: {}", keyword, url)
-                    val hashList = articleRepository.findAllId()
-                    val links = parseArticleLink(site.code, url, false)
+                    val url = site.search?.replace("{query}", keyword)
+                    logger.info("正在搜索: {}, 地址: {}", keyword, url)
+                    val links =
+                        try {
+                            parseArticleLink(management, site.code, url, false)
+                        } catch (e: Exception) {
+                            logger.error("解析失败 $url", e)
+                            emptyList()
+                        }
                    val total = links.size
                    val current = AtomicLong(0)
+                    val hashList = articleRepository.findAllId()
                    links
                        .filter {
                            if (hashList.contains(it.hash)) {
@@ -68,128 +68,36 @@ class NewsRunner : Runner {
                            } else true
                        }
                        .forEach { link ->
-                            Thread.sleep(500)
+                            Thread.sleep(1000)
                            logger.info("进度：{} 采集文章：{} {}", NumberUtil.formatPercent(current.addAndGet(1) * 1.0 / total, 2), link.title, link.url)
-                            parseArticle(site.code, link.url, link.title, link.datetime, link.hash)?.let { article ->
-                                if (!article.text.isNullOrBlank()) {
-                                    val triple = descriptionService.parseDescription(article.text)
-                                    logger.info("小标题：{}", triple?.first)
-                                    logger.info("描述：{}", triple?.second)
-                                    logger.info("相关度：{}", triple?.third)
-                                    article.subtitle = triple?.first
-                                    article.description = triple?.second
-                                    article.score = triple?.third
-                                }
+                            try {
+                                parseArticle(management, link)
+                                    ?.let { article ->
+                                        if (!article.text.isNullOrBlank() || !article.html.isNullOrBlank()) {
+                                            val content = if (article.text.isNullOrBlank()) article.html else article.text
+                                            val triple = descriptionService.parseDescription(content)
+                                            logger.info("标题：{}", triple?.title)
+                                            logger.info("作者：{}", triple?.author)
+                                            logger.info("副标题：{}", triple?.subtitle)
+                                            logger.info("描述：{}", triple?.description)
+                                            logger.info("标签：{}", triple?.tags)

-                                article.category = keyword
-                                articleRepository.save(article)
+                                            article.title = if (link.title.isNullOrBlank()) triple?.title else link.title
+                                            article.author = if (article.author.isNullOrBlank()) triple?.author else article.author
+                                            article.subtitle = triple?.subtitle
+                                            article.description = triple?.description
+                                            article.tags = triple?.tags
+                                        }
+
+                                        article.category = keyword
+                                        articleRepository.save(article)
+                                    }
+                            } catch (e: Exception) {
+                                logger.error("解析失败 $link", e)
                            }
                        }
                }
            }
        logger.info("本轮采集完成")
    }
-
-    fun parseArticle(code: String, url: String, title: String?, datetime: String?, hash: String): Article? {
-        logger.info("Title: {} Datetime: {}", title, datetime)
-        val page = try {
-            Failsafe
-                .with(retryPolicy)
-                .get(CheckedSupplier {
-                    runBlocking { management.parse(code, url) }
-                })
-        } catch (e: FailsafeException) {
-            logger.error("Parse failure", e)
-            return null
-        }
-        (page["text"] as? Map<*, *>)?.let { text ->
-            var datetimeText = text["datetime"] as? String
-            val createTime = try {
-                if (datetimeText.isNullOrBlank()) {
-                    datetimeText = datetime
-                }
-                if (datetimeText?.matches(Regex("\\d{10}")) == true) {
-                    DateUtil
-                        .date(datetimeText.toLong())
-                        .toTimestamp()
-                } else if (datetimeText?.matches(Regex("\\d{7}")) == true) {
-                    DateUtil
-                        .date(datetimeText.toLong() * 1000)
-                        .toTimestamp()
-                } else {
-                    DateUtil
-                        .parse(
-                            datetimeText,
-                            "yyyy-MM-dd HH:mm:ss",
-                            "yyyy-MM-dd",
-                            "yyyy年MM月dd日 HH:mm",
-                            "yyyy-MM-dd HH:mm",
-                        )
-                        .toTimestamp()
-                }
-            } catch (e: Exception) {
-                logger.error("Parse $datetimeText error", e)
-                null
-            }
-            val source = text["source"]?.let {
-                val document = Jsoup.parse((it as String))
-                document
-                    .select("script")
-                    .forEach { node -> node.remove() }
-                document.forEachNode { node -> node.removeAttr("style") }
-                document
-                    .body()
-                    .html()
-            }
-            return Article(
-                id = hash,
-                code = code,
-                url = url,
-                title = text["title"] as String?,
-                author = text["author"] as String?,
-                category = null,
-                text = if ((text["content"] as String?) == null) null else text["content"] as String,
-                html = source,
-                subtitle = null,
-                description = null,
-                score = null,
-                createTime = createTime,
-                pushed = false,
-            )
-        } ?: return null
-    }
-
-    data class ArticleLink(val url: String, val hash: String, val title: String?, val datetime: String?)
-
-    fun parseArticleLink(code: String, url: String, recursive: Boolean = false): List<ArticleLink> {
-        var next: String? = url
-        val links = mutableListOf<ArticleLink>()
-        do {
-            logger.info("解析目录：{}", next)
-            val page = try {
-                Failsafe
-                    .with(retryPolicy)
-                    .get(CheckedSupplier {
-                        runBlocking { management.parse(code, next!!) }
-                    })
-            } catch (e: FailsafeException) {
-                logger.error("Parse failure", e)
-                continue
-            }
-            @Suppress("UNCHECKED_CAST") (page["list"] as? List<Map<String, Any>>)?.let { list ->
-                for (item in list) {
-                    val title = item["title"] as String?
-                    val datetime = item["datetime"] as String?
-                    val link = item["link"] as String?
-                    if (link.isNullOrBlank()) {
-                        logger.warn("链接为空：{} {}", title, link)
-                    } else {
-                        links.add(ArticleLink(link, SecureUtil.md5(link), title, datetime))
-                    }
-                }
-            }
-            next = page["next"] as String?
-        } while (recursive && !next.isNullOrBlank())
-        return links
-    }
 }
--- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/service/DescriptionService.kt
+++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/service/DescriptionService.kt
@@ -12,6 +12,14 @@ import kotlin.time.toJavaDuration
 import org.slf4j.LoggerFactory
 import org.springframework.stereotype.Service

+data class Description( // built by DescriptionService.parseDescription from the parsed JSON
+    val title: String, // JSON path "title"
+    val author: String?, // JSON path "author"; nullable so a result without an author is not discarded
+    val subtitle: String, // JSON path "subtitle"
+    val description: String, // JSON path "description"
+    val tags: String, // JSON path "tags", with full-width commas replaced by ","
+)
+
@Service
 class DescriptionService {
    private val logger = LoggerFactory.getLogger(javaClass)
@@ -26,7 +34,7 @@ class DescriptionService {
        QianfanChat(),
    )

-    fun parseDescription(content: String?): Triple<String, String, Int>? {
+    fun parseDescription(content: String?): Description? {
        return content?.let {
            if (it.isNotBlank()) {
                var description: String?
@@ -50,11 +58,16 @@ class DescriptionService {
                                .replace("```json", "")
                                .replace("```", "")
                        )
-                        val subtitle = root.getByPath("title", String::class.java)
+                        val title = root.getByPath("title", String::class.java)
+                        val author = root.getByPath("author", String::class.java)
+                        val subtitle = root.getByPath("subtitle", String::class.java)
                        val desc = root.getByPath("description", String::class.java)
-                        val score = root.getByPath("score", Int::class.java)
-                        return@let Triple(subtitle, desc, score)
+                        val tags = root
+                            .getByPath("tags", String::class.java)
+                            .replace("，", ",")
+                        return@let Description(title, author, subtitle, desc, tags)
                    } catch (e: Throwable) {
+                        logger.error("json解析失败", e)
                        continue
                    }
                }
--- a/src/main/resources/static/component/overview-tab.js
+++ b/src/main/resources/static/component/overview-tab.js
@@ -5,7 +5,7 @@ function pagination() {
        model: 'normal',
        maxButtons: 10,
        showPageInput: false,
-        perPageAvailable: [10, 15, 20],
+        perPageAvailable: [10, 15, 20, 50, 100],
        activePage: '${page|default:1}',
        total: '${total|default:0}',
        className: 'text-right',
@@ -131,6 +131,11 @@ function overviewTab() {
                                className: 'text-current',
                                tpl: '${description}',
                            },
+                            {
+                                type: 'tpl',
+                                className: 'text-blue-900 text-sm mt-2',
+                                tpl: '${name}',
+                            },
                            {
                                type: 'wrapper',
                                size: 'none',
@@ -152,10 +157,21 @@ function overviewTab() {
                                        type: 'tag',
                                        label: '${author}',
                                        displayMode: 'rounded',
-                                        color: '#ff8888',
+                                        color: '#bd6464',
                                    },
                                ]
                            },
+                            {
+                                type: 'each',
+                                className: 'mt-2',
+                                source: "${SPLIT(tags, ',')}",
+                                items: {
+                                    type: 'tag',
+                                    label: '${item}',
+                                    displayMode: 'rounded',
+                                    color: '#6b3481',
+                                }
+                            }
                        ]
                    },
                },
--- a/src/main/resources/static/index.html
+++ b/src/main/resources/static/index.html
@@ -48,7 +48,7 @@
                ]
            }
        }
-        let debug = true
+        let debug = false
        let server = amis.embed(
            '#root',
            amisJSON,
--- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/Test.kt
+++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/Test.kt
@@ -1,6 +1,7 @@
 package com.lanyuanxiaoyao.digtal.market

 import cn.hutool.core.date.DateUtil
+import cn.hutool.core.io.FileUtil
 import com.lanyuanxiaoyao.digtal.market.ai.Chat
 import com.lanyuanxiaoyao.digtal.market.ai.QianfanChat
 import com.lanyuanxiaoyao.digtal.market.ai.ZhipuChat
@@ -144,4 +145,18 @@ class Test {
            )
        )
    }
+
+    // Manual scratch test: downloads one live page, drops script/style/link elements and inline
+    // style attributes, then writes the cleaned body HTML to source.txt in the working directory.
+    @Test
+    fun testNativeDownload() {
+        val downloader = BasicDownloaderFactory().build(emptyMap())
+        val page = runBlocking {
+            downloader.download("http://zfsg.gd.gov.cn/xxfb/dtxw/content/post_4515949.html")
+        }
+        val root = Jsoup.parse(page).body()
+        root.select("script, style, link").remove()
+        root.allElements.forEach { it.removeAttr("style") }
+        FileUtil.writeString(root.html(), java.io.File("source.txt"), Charsets.UTF_8)
+    }
 }
--- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt
+++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestManagement.kt
@@ -24,18 +24,19 @@ class TestManagement {

    @Test
    fun testParse() {
-        newsRunner
-            .parseArticleLink(
-                "9a7f1d8f-4f39-4120-adeb-7435339b97bb",
-                "https://www.cantonde.com/si/common/searchInfo?json=%7B%22NAME%22%3A%22数据要素%22%2C%22IN_CATEGORY%22%3A%22%22%2C%22NOT_IN_CATEGORY%22%3A%22%22%2C%22CATEGORY%22%3A%22%22%2C%22pageNo%22%3A1%2C%22pageSize%22%3A10%7D",
-            )
+        parseArticleLink(
+            management,
+            "9a7f1d8f-4f39-4120-adeb-7435339b97bb",
+            "https://www.gov.cn/zhengce/content/202409/content_6977766.htm",
+        )
            .forEach { link ->
-                val article = newsRunner.parseArticle(
+                val article = parseArticle(
+                    management,
                    "9a7f1d8f-4f39-4120-adeb-7435339b97bb",
                    link.url,
+                    "",
                    link.title,
                    link.datetime,
-                    "",
                )
                logger.info("{} {} {} {}", article?.title, article?.createTime, article?.author, article?.text)
            }
@@ -54,26 +55,26 @@ class TestManagement {

    @Test
    fun testParseList() {
-        newsRunner
-            .parseArticleLink(
-                "cee7f242-668b-41fb-adbc-96fb27d4bf35",
-                "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
-                true,
-            )
+        parseArticleLink(
+            management,
+            "cee7f242-668b-41fb-adbc-96fb27d4bf35",
+            "https://zwfwj.beijing.gov.cn/so/ss/query/s?qt=(\"数据要素\")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20",
+            true,
+        )
            // .let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
            .let { it.forEach { logger.info("{}", it.url) } }
    }

    @Test
    fun testParseArticle() {
-        newsRunner
-            .parseArticle(
-                "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
-                "https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
-                "工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案（试行）》的通知",
-                "2024-11-01 12:48:26",
-                "",
-            )
+        parseArticle(
+            management,
+            "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
+            "https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
+            "",
+            "工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案（试行）》的通知",
+            "2024-11-01 12:48:26",
+        )
            ?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }
    }
 }
--- a/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestRule.kt
+++ b/src/test/kotlin/com/lanyuanxiaoyao/digtal/market/TestRule.kt
@@ -14,8 +14,8 @@ class TestRule {
    private lateinit var management: Management

    private val link =
-        "https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html"
-    private val site = 北京市政务服务和数据管理局
+        "https://www.gov.cn/zhengce/202410/content_6979047.htm"
+    private val site = sites.find { it.code == "00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be" }!!

    @BeforeTest
    fun before() {
--- a/src/test/resources/test.http
+++ b/src/test/resources/test.http
@@ -70,4 +70,17 @@ Content-Type: application/json
 }

 ### Search
-POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
+POST https://zwfwj.beijing.gov.cn/so/ss/query/s?qt =("数据要素")&sort=dateDesc&siteCode=1100000248&tab=&page=1&pageSize=20
+
+### Search
+POST https://www.cantonde.com/si/common/searchInfo
+Content-Type: application/json
+
+{
+  "NAME": "数据要素",
+  "IN_CATEGORY": "",
+  "NOT_IN_CATEGORY": "",
+  "CATEGORY": "",
+  "pageNo": 1,
+  "pageSize": 10
+}