package main

import (
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/http"

	"github.com/PuerkitoBio/goquery"
	"resty.dev/v3"
)

var (
	errArticleContentNotFound    = errors.New("article content not found")
	errArticleContentParseFailed = errors.New("failed to parse article content")
)

// getArticleHTML returns the article HTML, reading from the cache first and
// falling back to fetching from the origin site on a cache miss.
func getArticleHTML(db *DB, client *resty.Client, articleID string) (string, error) {
	html, err := db.GetArticleHtml(articleID)
	if err == nil {
		return html, nil
	}

	cookies, err := getLoginCookies(db, client)
	if err != nil {
		return "", err
	}

	html, err = fetchRemoteArticleHTML(client, articleID, cookies)
	if err != nil {
		return "", err
	}

	// A cache-write failure is logged but not returned: the fetched HTML is
	// still valid and should be served to the caller.
	if err := db.SetArticleHtml(articleID, html, articleCacheTTL); err != nil {
		slog.Warn("failed to cache article",
			slog.String("article_id", articleID),
			slog.String("error", err.Error()))
	}
	return html, nil
}

// fetchRemoteArticleHTML downloads the article page from the origin site and
// extracts the body HTML.
func fetchRemoteArticleHTML(client *resty.Client, articleID string, cookies []*http.Cookie) (string, error) {
	request := client.R()
	if len(cookies) > 0 {
		request.SetCookies(cookies)
	}

	resp, err := request.Get(fmt.Sprintf(articleURLFormat, articleID))
	if err != nil {
		return "", err
	}
	defer func() { _ = resp.Body.Close() }()

	return extractArticleHTML(resp.Body)
}

// extractArticleHTML extracts the article body section from the origin page.
func extractArticleHTML(reader io.Reader) (string, error) {
	doc, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return "", err
	}

	selection := doc.Find("div.entry")
	if selection.Length() == 0 {
		return "", errArticleContentNotFound
	}

	html, err := selection.Html()
	if err != nil {
		return "", fmt.Errorf("%w: %v", errArticleContentParseFailed, err)
	}
	return html, nil
}
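// handleArticle is a hypothetical sketch (not part of the original code) of
// how getArticleHTML might be wired into an HTTP handler; the route pattern,
// the "id" path parameter, and the error-to-status mapping are all
// assumptions for illustration. It only uses packages already imported above.
//
// Example registration, assuming Go 1.22+ pattern routing:
//
//	http.Handle("GET /articles/{id}", handleArticle(db, client))
func handleArticle(db *DB, client *resty.Client) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		articleID := r.PathValue("id") // requires Go 1.22+; hypothetical route shape
		html, err := getArticleHTML(db, client, articleID)
		if err != nil {
			// Surface upstream fetch/parse failures as 502; cache details stay internal.
			http.Error(w, err.Error(), http.StatusBadGateway)
			return
		}
		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		_, _ = io.WriteString(w, html)
	}
}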