From 3175ab7a8c55ef75adf3c12f240c8e93714defd4 Mon Sep 17 00:00:00 2001 From: lanyuanxiaoyao Date: Tue, 5 Nov 2024 16:55:03 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E4=BC=98=E5=8C=96title=E5=92=8Cdatetim?= =?UTF-8?q?e=E4=BB=8E=E6=9C=8D=E5=8A=A1=E7=AB=AF=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/lanyuanxiaoyao/digtal/market/Sites.kt | 53 +++++-------------- .../digtal/market/runner/NewsRunner.kt | 28 ++++++---- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt index 996c3a6..15d431e 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/Sites.kt @@ -17,18 +17,6 @@ private val iframe = Pair("iframe", "true") private val post = Pair("post", "true") private val form = Pair("form", "true") -private val timeScript = Script( - Script.Type.Javascript, - // language=javascript - script = "if (text && text !== '') {\n return text\n} else if(params['datetime']) {\n return params['datetime']\n} else {\n return text\n}", -) - -private val titleScript = Script( - Script.Type.Javascript, - // language=javascript - script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}", -) - // language=regexp private val commonRemove = listOf( "<.+?>", @@ -128,7 +116,6 @@ private val 广东政务服务和数据管理局 = Site( expression = "body", title = Selector(".title-page .txt > span"), author = Selector(process = Process(default = "广东政务服务和数据管理局")), - dateTime = Selector(process = Process(script = listOf(timeScript))), content = Selector(".content"), extra = mapOf( "source" to Selector(".content", properties = mapOf(html)) @@ -254,7 +241,7 @@ private val 中华人民共和国中央人民政府 = Site( downloader = Downloader.Type.BROWSER, text = Content( 
expression = ".content .article", - title = Selector("h1#ti", process = Process(script = listOf(titleScript))), + title = Selector("h1#ti"), author = Selector( ".pages-date > .font", process = Process( @@ -262,11 +249,7 @@ private val 中华人民共和国中央人民政府 = Site( remove = listOf("来源:") ) ), - dateTime = Selector( - ".pages-date", - process = Process(script = listOf(timeScript)), - properties = mapOf("precision" to "true") - ), + dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")), content = Selector(".pages_content"), extra = mapOf( "source" to Selector(".pages_content", properties = mapOf("html" to "true")) @@ -278,9 +261,8 @@ private val 中华人民共和国中央人民政府 = Site( downloader = Downloader.Type.BROWSER, text = Content( expression = ".main-content", - title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))), + title = Selector(".qa_content_box"), author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源:"))), - dateTime = Selector(process = Process(script = listOf(timeScript))), content = Selector(".qa_content_text"), extra = mapOf( "source" to Selector(".main-content", properties = mapOf("html" to "true")) @@ -377,17 +359,6 @@ val 广州数据交易所 = Site( expression = "$.data", title = Selector("$.TITLE"), content = Selector("$.CONTENT", process = Process(remove = commonRemove)), - dateTime = Selector( - process = Process( - script = listOf( - Script( - Script.Type.Javascript, - // language=javascript - "let datetimeText = params['datetime']\nif (datetimeText && datetimeText !== '') {\n let datetime = new Date(parseInt(params['datetime']))\n return `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`\n}\nreturn ''", - ) - ) - ) - ), author = Selector(process = Process(default = "广州数据交易所")) ), properties = mapOf(post), @@ -425,13 +396,17 @@ val 北京市政务服务和数据管理局 = Site( ), link = Selector("$.data.url") 
), - next = Selector("$.totalHits", process = Process(script = listOf( - Script( - Script.Type.Javascript, - // language=javascript - "if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}" - )) - )) + next = Selector( + "$.totalHits", process = Process( + script = listOf( + Script( + Script.Type.Javascript, + // language=javascript + "if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}" + ) + ) + ) + ) ), // language=regexp "https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule( diff --git a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt index 4fb9872..7e606d0 100644 --- a/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt +++ b/src/main/kotlin/com/lanyuanxiaoyao/digtal/market/runner/NewsRunner.kt @@ -109,15 +109,25 @@ class NewsRunner : Runner { } (page["text"] as? Map<*, *>)?.let { text -> val createTime = try { - DateUtil - .parse( - (text["datetime"] as String?)?.trim(), - "yyyy-MM-dd HH:mm:ss", - "yyyy-MM-dd", - "yyyy年MM月dd日 HH:mm", - "yyyy-MM-dd HH:mm", - ) - .toTimestamp() + var datetimeText = text["datetime"] as? 
String + if (datetimeText.isNullOrBlank()) { + datetimeText = datetime + } + if (datetimeText?.matches(Regex("\\d{13}")) == true) { + DateUtil.date(datetimeText.toLong()).toTimestamp() + } else if (datetimeText?.matches(Regex("\\d{10}")) == true) { + DateUtil.date(datetimeText.toLong() * 1000).toTimestamp() + } else { + DateUtil + .parse( + datetimeText?.trim(), + "yyyy-MM-dd HH:mm:ss", + "yyyy-MM-dd", + "yyyy年MM月dd日 HH:mm", + "yyyy-MM-dd HH:mm", + ) + .toTimestamp() + } } catch (e: Exception) { logger.error("Parse ${text["datetime"]} error", e) null