1
0

fix: 修复zhengceku部分网站无法解析

This commit is contained in:
2024-11-05 17:49:51 +08:00
parent 3175ab7a8c
commit 697460ed98
3 changed files with 27 additions and 16 deletions

View File

@@ -237,6 +237,18 @@ private val 中华人民共和国中央人民政府 = Site(
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/(zhengceku)/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(
expression = ".policyLibraryOverview_content",
author = Selector("td:contains(源:) + td"),
content = Selector(".pages_content"),
extra = mapOf(
"source" to Selector(".pages_content", properties = mapOf("html" to "true"))
)
)
),
// language=regexp
"https*://www\\.gov\\.cn/.+/content_\\d+\\.html*" to Rule(
downloader = Downloader.Type.BROWSER,
text = Content(

View File

@@ -96,31 +96,30 @@ class NewsRunner : Runner {
Failsafe
.with(retryPolicy)
.get(CheckedSupplier {
runBlocking {
management.parse(code, url, properties = mutableMapOf<String, String>().apply {
title?.let { put("title", title) }
datetime?.let { put("datetime", datetime) }
})
}
runBlocking { management.parse(code, url) }
})
} catch (e: FailsafeException) {
logger.error("Parse failure", e)
return null
}
(page["text"] as? Map<*, *>)?.let { text ->
var datetimeText = text["datetime"] as? String
val createTime = try {
var datetimeText = text["datetime"] as? String
if (datetimeText.isNullOrBlank()) {
datetimeText = datetime
datetimeText = datetime
}
if (datetimeText?.matches(Regex("\\d{10}")) == true) {
DateUtil.date(datetimeText.toLong()).toTimestamp()
DateUtil
.date(datetimeText.toLong())
.toTimestamp()
} else if (datetimeText?.matches(Regex("\\d{7}")) == true) {
DateUtil.date(datetimeText.toLong() * 1000).toTimestamp()
DateUtil
.date(datetimeText.toLong() * 1000)
.toTimestamp()
} else {
DateUtil
.parse(
(text["datetime"] as String?)?.trim(),
datetimeText,
"yyyy-MM-dd HH:mm:ss",
"yyyy-MM-dd",
"yyyy年MM月dd日 HH:mm",
@@ -129,7 +128,7 @@ class NewsRunner : Runner {
.toTimestamp()
}
} catch (e: Exception) {
logger.error("Parse ${text["datetime"]} error", e)
logger.error("Parse $datetimeText error", e)
null
}
val source = text["source"]?.let {

View File

@@ -68,10 +68,10 @@ class TestManagement {
fun testParseArticle() {
newsRunner
.parseArticle(
"74ee6b33-c1a3-41f9-b947-acd0bebd0e6e",
"https://zwfwj.beijing.gov.cn/zwgk/2024zcwj/202409/t20240927_3908531.html",
"no title",
"no datetime",
"00bfe1f5-7662-4ea5-ada3-6e9dfc19f0be",
"https://www.gov.cn/zhengce/zhengceku/202411/content_6984322.htm",
"工业和信息化部关于印发《工业和信息化领域数据安全事件应急预案(试行)》的通知",
"2024-11-01 12:48:26",
"",
)
?.let { logger.info("{}", JSONUtil.toJsonPrettyStr(it)) }