1
0

feat: 优化title和datetime从服务端进行处理

This commit is contained in:
2024-11-05 16:55:03 +08:00
parent bd67c9f830
commit 3175ab7a8c
2 changed files with 33 additions and 48 deletions

View File

@@ -17,18 +17,6 @@ private val iframe = Pair("iframe", "true")
// Single-entry property flags, kept as pairs so they can be passed straight to mapOf(...).
private val post = "post" to "true"
private val form = "form" to "true"
// Date/time fallback script: returns the selected `text` when it is non-empty,
// otherwise `params['datetime']` when that is set, otherwise `text` unchanged.
private val timeScript = Script(
    Script.Type.Javascript,
    // language=javascript
    script = "if (text && text !== '') {\n" +
        " return text\n" +
        "} else if(params['datetime']) {\n" +
        " return params['datetime']\n" +
        "} else {\n" +
        " return text\n" +
        "}",
)
// Title fallback script: returns the selected `text` when it is non-empty,
// otherwise `params['title']` when that is set, otherwise `text` unchanged.
private val titleScript = Script(
    Script.Type.Javascript,
    // language=javascript
    script = "if (text && text !== '') {\n" +
        " return text\n" +
        "} else if(params['title']) {\n" +
        " return params['title']\n" +
        "} else {\n" +
        " return text\n" +
        "}",
)
// language=regexp
private val commonRemove = listOf(
"<.+?>",
@@ -128,7 +116,6 @@ private val 广东政务服务和数据管理局 = Site(
expression = "body",
title = Selector(".title-page .txt > span"),
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
dateTime = Selector(process = Process(script = listOf(timeScript))),
content = Selector(".content"),
extra = mapOf(
"source" to Selector(".content", properties = mapOf(html))
@@ -254,7 +241,7 @@ private val 中华人民共和国中央人民政府 = Site(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".content .article",
title = Selector("h1#ti", process = Process(script = listOf(titleScript))),
title = Selector("h1#ti"),
author = Selector(
".pages-date > .font",
process = Process(
@@ -262,11 +249,7 @@ private val 中华人民共和国中央人民政府 = Site(
remove = listOf("来源:")
)
),
dateTime = Selector(
".pages-date",
process = Process(script = listOf(timeScript)),
properties = mapOf("precision" to "true")
),
dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
@@ -278,9 +261,8 @@ private val 中华人民共和国中央人民政府 = Site(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".main-content",
title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))),
title = Selector(".qa_content_box"),
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("\\s*源:"))),
dateTime = Selector(process = Process(script = listOf(timeScript))),
content = Selector(".qa_content_text"),
extra = mapOf(
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
@@ -377,17 +359,6 @@ val 广州数据交易所 = Site(
expression = "$.data",
title = Selector("$.TITLE"),
content = Selector("$.CONTENT", process = Process(remove = commonRemove)),
dateTime = Selector(
process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
"let datetimeText = params['datetime']\nif (datetimeText && datetimeText !== '') {\n let datetime = new Date(parseInt(params['datetime']))\n return `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`\n}\nreturn ''",
)
)
)
),
author = Selector(process = Process(default = "广州数据交易所"))
),
properties = mapOf(post),
@@ -425,13 +396,17 @@ val 北京市政务服务和数据管理局 = Site(
),
link = Selector("$.data.url")
),
next = Selector("$.totalHits", process = Process(script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
"if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}"
))
))
next = Selector(
"$.totalHits", process = Process(
script = listOf(
Script(
Script.Type.Javascript,
// language=javascript
"if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}"
)
)
)
)
),
// language=regexp
"https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(

View File

@@ -109,15 +109,25 @@ class NewsRunner : Runner {
}
(page["text"] as? Map<*, *>)?.let { text ->
val createTime = try {
DateUtil
.parse(
(text["datetime"] as String?)?.trim(),
"yyyy-MM-dd HH:mm:ss",
"yyyy-MM-dd",
"yyyy年MM月dd日 HH:mm",
"yyyy-MM-dd HH:mm",
)
.toTimestamp()
// Prefer the datetime extracted from the page; when it is missing or blank,
// fall back to `datetime` from the enclosing scope.
val extractedDatetime = text["datetime"] as? String
val chosenDatetime: String? = if (extractedDatetime.isNullOrBlank()) datetime else extractedDatetime
val datetimeText = chosenDatetime?.trim()
when {
    // 13 digits: epoch milliseconds.
    // NOTE(review): assumes DateUtil is Hutool's, whose date(Long) takes milliseconds — confirm.
    datetimeText?.matches(Regex("\\d{13}")) == true ->
        DateUtil.date(datetimeText.toLong()).toTimestamp()
    // 10 digits: epoch seconds, so scale to milliseconds before converting.
    datetimeText?.matches(Regex("\\d{10}")) == true ->
        DateUtil.date(datetimeText.toLong() * 1000).toTimestamp()
    // Anything else is treated as a formatted date string. Parse the resolved
    // value (not the raw page field) so the fallback above is honoured.
    else ->
        DateUtil
            .parse(
                datetimeText,
                "yyyy-MM-dd HH:mm:ss",
                "yyyy-MM-dd",
                "yyyy年MM月dd日 HH:mm",
                "yyyy-MM-dd HH:mm",
            )
            .toTimestamp()
}
} catch (e: Exception) {
logger.error("Parse ${text["datetime"]} error", e)
null