Files
423down-proxy/article.go
T
2026-05-12 07:58:30 +00:00

78 lines
1.8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"github.com/PuerkitoBio/goquery"
"resty.dev/v3"
)
// Sentinel errors reported while extracting the article body from a page.
// Compare against them with errors.Is.
var (
	// errArticleContentParseFailed is wrapped when the matched article
	// element cannot be rendered back to HTML.
	errArticleContentParseFailed = errors.New("无法解析文章内容")

	// errArticleContentNotFound is returned when the page contains no
	// "div.entry" element.
	errArticleContentNotFound = errors.New("无法找到文章内容")
)
// getArticleHTML returns the HTML of the article identified by articleID.
//
// The cache is consulted first; when the lookup returns an error (treated as a
// miss) the article is fetched from the origin site using the login cookies
// and then written back to the cache with articleCacheTTL. A failed cache
// write is only logged as a warning: the freshly fetched HTML is still
// returned to the caller.
func getArticleHTML(db *DB, client *resty.Client, articleID string) (string, error) {
	// Fast path: serve straight from the cache.
	if cached, err := db.GetArticleHtml(articleID); err == nil {
		return cached, nil
	}

	loginCookies, err := getLoginCookies(db, client)
	if err != nil {
		return "", err
	}

	fetched, err := fetchRemoteArticleHTML(client, articleID, loginCookies)
	if err != nil {
		return "", err
	}

	// Caching is best-effort; never fail the request because of it.
	if storeErr := db.SetArticleHtml(articleID, fetched, articleCacheTTL); storeErr != nil {
		slog.Warn(
			"failed to cache article",
			slog.String("article_id", articleID),
			slog.String("error", storeErr.Error()),
		)
	}
	return fetched, nil
}
// fetchRemoteArticleHTML downloads the page for articleID from the origin site
// and returns the article body HTML extracted from the response.
//
// The request URL is built from articleURLFormat. When cookies is non-empty
// they are attached to the outgoing request. Transport errors and extraction
// errors are returned unchanged.
func fetchRemoteArticleHTML(client *resty.Client, articleID string, cookies []*http.Cookie) (string, error) {
	articleURL := fmt.Sprintf(articleURLFormat, articleID)

	req := client.R()
	if len(cookies) != 0 {
		req.SetCookies(cookies)
	}

	resp, err := req.Get(articleURL)
	if err != nil {
		return "", err
	}
	// The body is only read here, so an error from Close is deliberately ignored.
	defer func() { _ = resp.Body.Close() }()

	return extractArticleHTML(resp.Body)
}
// extractArticleHTML parses an origin-site page read from reader and returns
// the inner HTML of its article body, selected with "div.entry".
//
// Errors:
//   - the parser's error, unchanged, if the document cannot be read;
//   - errArticleContentNotFound if no "div.entry" element is present;
//   - an error wrapping errArticleContentParseFailed (matchable with
//     errors.Is) if the selected element cannot be rendered to HTML.
func extractArticleHTML(reader io.Reader) (string, error) {
	doc, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return "", err
	}

	entry := doc.Find("div.entry")
	if entry.Length() == 0 {
		return "", errArticleContentNotFound
	}

	content, err := entry.Html()
	if err != nil {
		// Keep a separator between the sentinel text and the cause; without
		// it the two messages run together into one unreadable string.
		return "", fmt.Errorf("%w: %v", errArticleContentParseFailed, err)
	}
	return content, nil
}