78 lines
1.8 KiB
Go
78 lines
1.8 KiB
Go
package main
|
||
|
||
import (
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"log/slog"
|
||
"net/http"
|
||
|
||
"github.com/PuerkitoBio/goquery"
|
||
"resty.dev/v3"
|
||
)
|
||
|
||
// Sentinel errors produced while extracting an article body from a fetched
// page. Match them with errors.Is; they may be wrapped with extra detail.
var (
	// errArticleContentNotFound is returned when the page contains no
	// article body element.
	errArticleContentNotFound = errors.New("无法找到文章内容")

	// errArticleContentParseFailed is returned when the article body element
	// was located but its HTML could not be produced.
	errArticleContentParseFailed = errors.New("无法解析文章内容")
)
|
||
|
||
// getArticleHTML 优先从缓存读取文章 HTML,缓存未命中时从源站抓取。
|
||
func getArticleHTML(db *DB, client *resty.Client, articleID string) (string, error) {
|
||
html, err := db.GetArticleHtml(articleID)
|
||
if err == nil {
|
||
return html, nil
|
||
}
|
||
|
||
cookies, err := getLoginCookies(db, client)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
html, err = fetchRemoteArticleHTML(client, articleID, cookies)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
if err := db.SetArticleHtml(articleID, html, articleCacheTTL); err != nil {
|
||
slog.Warn("failed to cache article", slog.String("article_id", articleID), slog.String("error", err.Error()))
|
||
}
|
||
|
||
return html, nil
|
||
}
|
||
|
||
// fetchRemoteArticleHTML 从源站下载文章页面并提取正文 HTML。
|
||
func fetchRemoteArticleHTML(client *resty.Client, articleID string, cookies []*http.Cookie) (string, error) {
|
||
request := client.R()
|
||
if len(cookies) > 0 {
|
||
request.SetCookies(cookies)
|
||
}
|
||
|
||
resp, err := request.Get(fmt.Sprintf(articleURLFormat, articleID))
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
defer func() { _ = resp.Body.Close() }()
|
||
|
||
return extractArticleHTML(resp.Body)
|
||
}
|
||
|
||
// extractArticleHTML 从源站页面中提取文章正文区域。
|
||
func extractArticleHTML(reader io.Reader) (string, error) {
|
||
doc, err := goquery.NewDocumentFromReader(reader)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
selection := doc.Find("div.entry")
|
||
if selection.Length() == 0 {
|
||
return "", errArticleContentNotFound
|
||
}
|
||
|
||
html, err := selection.Html()
|
||
if err != nil {
|
||
return "", fmt.Errorf("%w:%v", errArticleContentParseFailed, err)
|
||
}
|
||
|
||
return html, nil
|
||
}
|