feat: 优化title和datetime从服务端进行处理
This commit is contained in:
@@ -17,18 +17,6 @@ private val iframe = Pair("iframe", "true")
|
||||
private val post = Pair("post", "true")
|
||||
private val form = Pair("form", "true")
|
||||
|
||||
/**
 * JavaScript post-processing step that supplies a fallback for a date/time value.
 *
 * The embedded script returns `text` unchanged when it is non-empty; otherwise it returns
 * `params['datetime']` when that entry is truthy; otherwise it returns `text` as-is
 * (i.e. the empty/falsy value it received).
 *
 * NOTE(review): `text` and `params` are bound by the script runtime, which is not visible
 * here - presumably the selector's extracted text and the request/page parameters; confirm.
 */
private val timeScript = Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
script = "if (text && text !== '') {\n return text\n} else if(params['datetime']) {\n return params['datetime']\n} else {\n return text\n}",
|
||||
)
|
||||
|
||||
/**
 * JavaScript post-processing step that supplies a fallback for a title value.
 *
 * The embedded script returns `text` unchanged when it is non-empty; otherwise it returns
 * `params['title']` when that entry is truthy; otherwise it returns `text` as-is
 * (i.e. the empty/falsy value it received).
 *
 * NOTE(review): `text` and `params` are bound by the script runtime, which is not visible
 * here - presumably the selector's extracted text and the request/page parameters; confirm.
 */
private val titleScript = Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
script = "if (text && text !== '') {\n return text\n} else if(params['title']) {\n return params['title']\n} else {\n return text\n}",
|
||||
)
|
||||
|
||||
// language=regexp
|
||||
private val commonRemove = listOf(
|
||||
"<.+?>",
|
||||
@@ -128,7 +116,6 @@ private val 广东政务服务和数据管理局 = Site(
|
||||
expression = "body",
|
||||
title = Selector(".title-page .txt > span"),
|
||||
author = Selector(process = Process(default = "广东政务服务和数据管理局")),
|
||||
dateTime = Selector(process = Process(script = listOf(timeScript))),
|
||||
content = Selector(".content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".content", properties = mapOf(html))
|
||||
@@ -254,7 +241,7 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".content .article",
|
||||
title = Selector("h1#ti", process = Process(script = listOf(titleScript))),
|
||||
title = Selector("h1#ti"),
|
||||
author = Selector(
|
||||
".pages-date > .font",
|
||||
process = Process(
|
||||
@@ -262,11 +249,7 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
remove = listOf("来源:")
|
||||
)
|
||||
),
|
||||
dateTime = Selector(
|
||||
".pages-date",
|
||||
process = Process(script = listOf(timeScript)),
|
||||
properties = mapOf("precision" to "true")
|
||||
),
|
||||
dateTime = Selector(".pages-date", properties = mapOf("precision" to "true")),
|
||||
content = Selector(".pages_content"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
|
||||
@@ -278,9 +261,8 @@ private val 中华人民共和国中央人民政府 = Site(
|
||||
downloader = Downloader.Type.BROWSER,
|
||||
text = Content(
|
||||
expression = ".main-content",
|
||||
title = Selector(".qa_content_box", process = Process(script = listOf(titleScript))),
|
||||
title = Selector(".qa_content_box"),
|
||||
author = Selector(".qa-subtext-item:contains(来源)", process = Process(remove = listOf("来\\s*源:"))),
|
||||
dateTime = Selector(process = Process(script = listOf(timeScript))),
|
||||
content = Selector(".qa_content_text"),
|
||||
extra = mapOf(
|
||||
"source" to Selector(".main-content", properties = mapOf("html" to "true"))
|
||||
@@ -377,17 +359,6 @@ val 广州数据交易所 = Site(
|
||||
expression = "$.data",
|
||||
title = Selector("$.TITLE"),
|
||||
content = Selector("$.CONTENT", process = Process(remove = commonRemove)),
|
||||
dateTime = Selector(
|
||||
process = Process(
|
||||
script = listOf(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
"let datetimeText = params['datetime']\nif (datetimeText && datetimeText !== '') {\n let datetime = new Date(parseInt(params['datetime']))\n return `\${datetime.getFullYear()}-\${datetime.getMonth() + 1}-\${datetime.getDate()} \${datetime.getHours()}:\${datetime.getMinutes()}:\${datetime.getSeconds()}`\n}\nreturn ''",
|
||||
)
|
||||
)
|
||||
)
|
||||
),
|
||||
author = Selector(process = Process(default = "广州数据交易所"))
|
||||
),
|
||||
properties = mapOf(post),
|
||||
@@ -425,13 +396,17 @@ val 北京市政务服务和数据管理局 = Site(
|
||||
),
|
||||
link = Selector("$.data.url")
|
||||
),
|
||||
next = Selector("$.totalHits", process = Process(script = listOf(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
"if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}"
|
||||
))
|
||||
))
|
||||
next = Selector(
|
||||
"$.totalHits", process = Process(
|
||||
script = listOf(
|
||||
Script(
|
||||
Script.Type.Javascript,
|
||||
// language=javascript
|
||||
"if (text && text !== '') {\n let total = parseInt(text)\n let current = parseInt(params['page'] ?? '1')\n let size = parseInt(params['pageSize'] ?? 20)\n if (current * size >= total) {\n return ''\n }\n return params['url'].replace(/page=\\d+/, 'page=' + (current + 1))\n} else {\n return ''\n}"
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
),
|
||||
// language=regexp
|
||||
"https://zwfwj\\.beijing\\.gov\\.cn/.+t\\d+_\\d+.html" to Rule(
|
||||
|
||||
@@ -109,15 +109,25 @@ class NewsRunner : Runner {
|
||||
}
|
||||
(page["text"] as? Map<*, *>)?.let { text ->
|
||||
val createTime = try {
|
||||
DateUtil
|
||||
.parse(
|
||||
(text["datetime"] as String?)?.trim(),
|
||||
"yyyy-MM-dd HH:mm:ss",
|
||||
"yyyy-MM-dd",
|
||||
"yyyy年MM月dd日 HH:mm",
|
||||
"yyyy-MM-dd HH:mm",
|
||||
)
|
||||
.toTimestamp()
|
||||
var datetimeText = text["datetime"] as? String
|
||||
if (datetimeText.isNullOrBlank()) {
|
||||
datetimeText = datetime
|
||||
}
|
||||
if (datetimeText?.matches(Regex("\\d{10}")) == true) {
|
||||
DateUtil.date(datetimeText.toLong()).toTimestamp()
|
||||
} else if (datetimeText?.matches(Regex("\\d{7}")) == true) {
|
||||
DateUtil.date(datetimeText.toLong() * 1000).toTimestamp()
|
||||
} else {
|
||||
DateUtil
|
||||
.parse(
|
||||
(text["datetime"] as String?)?.trim(),
|
||||
"yyyy-MM-dd HH:mm:ss",
|
||||
"yyyy-MM-dd",
|
||||
"yyyy年MM月dd日 HH:mm",
|
||||
"yyyy-MM-dd HH:mm",
|
||||
)
|
||||
.toTimestamp()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
logger.error("Parse ${text["datetime"]} error", e)
|
||||
null
|
||||
|
||||
Reference in New Issue
Block a user