Files
bilinovel-downloader/downloader/bilinovel/bilinovel.go
nite 6084386989 refactor(bilinovel): Migrate browser automation from Chromedp to Playwright
This commit replaces the `chromedp` library with `playwright-go` for browser automation within the Bilinovel downloader.

Changes include:
*   Updated `Bilinovel` struct to manage Playwright browser, context, and page instances.
*   Rewrote `initBrowser` and `Close` methods to use Playwright's API for browser lifecycle management.
*   Refactored `processContentWithChromedp` to `processContentWithPlaywright`, adapting the logic to use Playwright's page evaluation capabilities.
*   Removed unused `context` and `time` imports.
*   Added HTML cleanup in `getChapterByPage` to remove `class` attributes from images and `data-k` attributes from all elements, improving content consistency.
2025-10-06 07:58:31 +11:00

523 lines
14 KiB
Go

package bilinovel
import (
"bilinovel-downloader/model"
"bilinovel-downloader/utils"
"bytes"
"crypto/sha256"
_ "embed"
"fmt"
"log"
"net/http"
"os"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
mapper "github.com/bestnite/font-mapper"
"github.com/playwright-community/playwright-go"
)
// readTTF holds the bytes of read.ttf, embedded at build time. New passes it
// as the first argument to mapper.NewGlyphOutlineMapper.
//
//go:embed read.ttf
var readTTF []byte

// miLantingTTF holds the bytes of "MI LANTING.ttf", embedded at build time.
// New passes it as the second argument to mapper.NewGlyphOutlineMapper.
//
//go:embed "MI LANTING.ttf"
var miLantingTTF []byte
// Bilinovel downloads novels, volumes and chapters from www.bilinovel.com.
//
// Catalog, volume and chapter pages are fetched through restyClient. Chapter
// pages are additionally loaded into a single Playwright page that is created
// once and reused, so that hidden elements can be stripped from the markup.
type Bilinovel struct {
	fontMapper  *mapper.GlyphOutlineMapper
	textOnly    bool
	restyClient *utils.RestyClient

	// Browser objects are created once by initBrowser and reused for every page.
	browser        playwright.Browser
	browserContext playwright.BrowserContext
	page           playwright.Page

	// pw is the Playwright driver handle; it is kept so Close can stop it.
	pw *playwright.Playwright
}

// New creates a Bilinovel with its font mapper, HTTP client and browser
// initialized. Call Close when the downloader is no longer needed.
func New() (*Bilinovel, error) {
	fontMapper, err := mapper.NewGlyphOutlineMapper(readTTF, miLantingTTF)
	if err != nil {
		return nil, fmt.Errorf("failed to create font mapper: %w", err)
	}

	b := &Bilinovel{
		fontMapper:  fontMapper,
		textOnly:    false,
		restyClient: utils.NewRestyClient(50),
	}
	if err := b.initBrowser(); err != nil {
		return nil, fmt.Errorf("failed to init browser: %w", err)
	}
	return b, nil
}

// SetTextOnly toggles text-only mode. When enabled, getChapterByPage removes
// <img> elements instead of downloading them.
func (b *Bilinovel) SetTextOnly(textOnly bool) {
	b.textOnly = textOnly
}

// GetExtraFiles returns the extra files this downloader contributes; there
// are none, so it always returns nil.
func (b *Bilinovel) GetExtraFiles() []model.ExtraFile {
	return nil
}

// initBrowser starts the Playwright driver, launches Chromium and opens the
// context and page that are reused for all chapter processing.
//
// If any step fails, everything started so far is shut down again so that no
// driver or browser process is left behind.
func (b *Bilinovel) initBrowser() error {
	pw, err := playwright.Run()
	if err != nil {
		return fmt.Errorf("could not start playwright: %w", err)
	}
	b.pw = pw

	initialized := false
	defer func() {
		if !initialized {
			// Close only logs failures and always returns nil.
			_ = b.Close()
		}
	}()

	// NOTE(review): the browser is launched with a visible window
	// (Headless=false); presumably intentional — confirm before changing.
	browser, err := pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(false),
	})
	if err != nil {
		return fmt.Errorf("could not launch browser: %w", err)
	}
	b.browser = browser

	browserContext, err := browser.NewContext()
	if err != nil {
		return fmt.Errorf("could not create browser context: %w", err)
	}
	b.browserContext = browserContext

	page, err := browserContext.NewPage()
	if err != nil {
		return fmt.Errorf("could not create page: %w", err)
	}
	b.page = page

	initialized = true
	log.Println("Browser initialized successfully")
	return nil
}

// Close releases the browser and stops the Playwright driver.
//
// Shutdown is best effort: failures are logged and Close always returns nil.
// It is safe to call Close more than once or on a partially initialized value.
func (b *Bilinovel) Close() error {
	if b.browser != nil {
		if err := b.browser.Close(); err != nil {
			log.Printf("could not close browser: %v", err)
		}
	}
	if b.pw != nil {
		if err := b.pw.Stop(); err != nil {
			log.Printf("could not stop playwright: %v", err)
		}
	}
	b.page = nil
	b.browserContext = nil
	b.browser = nil
	b.pw = nil
	return nil
}
//go:embed style.css
var styleCSS []byte
func (b *Bilinovel) GetStyleCSS() string {
return string(styleCSS)
}
// GetNovel fetches the novel's landing page, extracts its title, description
// and contributors, and then loads every volume through getAllVolumes.
//
// Both the ".authorname" and ".illname" links are collected into Authors.
// skipChapter is forwarded unchanged to getAllVolumes.
func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error) {
	log.Printf("Getting novel %v\n", novelId)

	resp, err := b.restyClient.R().Get(fmt.Sprintf("https://www.bilinovel.com/novel/%v.html", novelId))
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if status := resp.StatusCode(); status != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %v", err)
	}

	novel := &model.Novel{}
	novel.Id = novelId
	novel.Title = strings.TrimSpace(doc.Find(".book-title").First().Text())
	novel.Description = strings.TrimSpace(doc.Find(".book-summary>content").First().Text())

	// Writers first, then illustrators, all into the same Authors list.
	for _, selector := range []string{".authorname>a", ".illname>a"} {
		doc.Find(selector).Each(func(_ int, link *goquery.Selection) {
			novel.Authors = append(novel.Authors, strings.TrimSpace(link.Text()))
		})
	}

	allVolumes, err := b.getAllVolumes(novelId, skipChapter)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel volumes: %v", err)
	}
	novel.Volumes = allVolumes

	return novel, nil
}
// GetVolume loads one volume of a novel.
//
// It first reads the novel's catalog page to find the volume's 1-based
// position among the "a.volume-cover-img" links (stored in SeriesIdx) and the
// novel title, then reads the volume page for title, description, cover,
// contributors and the chapter list.
//
// When skipChapter is false every listed chapter is downloaded via GetChapter;
// otherwise the chapters only carry the title and URL found on the volume page.
// An error is returned if the volume is not listed in the catalog, if any
// request fails or returns a non-200 status, or if a chapter URL does not
// contain a numeric chapter id.
func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*model.Volume, error) {
	log.Printf("Getting volume %v of novel %v\n", volumeId, novelId)

	catalogUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
	resp, err := b.restyClient.R().Get(catalogUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	// seriesIdx stays 0 when the volume is not listed; positions are 1-based.
	volumeHref := fmt.Sprintf("/novel/%v/vol_%v.html", novelId, volumeId)
	seriesIdx := 0
	doc.Find("a.volume-cover-img").Each(func(i int, s *goquery.Selection) {
		if s.AttrOr("href", "") == volumeHref {
			seriesIdx = i + 1
		}
	})
	if seriesIdx == 0 {
		return nil, fmt.Errorf("volume not found: %v", volumeId)
	}
	novelTitle := strings.TrimSpace(doc.Find(".book-title").First().Text())

	volumeUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/vol_%v.html", novelId, volumeId)
	resp, err = b.restyClient.R().Get(volumeUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}
	doc, err = goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	volume := &model.Volume{}
	volume.NovelId = novelId
	volume.NovelTitle = novelTitle
	volume.Id = volumeId
	volume.SeriesIdx = seriesIdx
	volume.Title = strings.TrimSpace(doc.Find(".book-title").First().Text())
	volume.Description = strings.TrimSpace(doc.Find(".book-summary>content").First().Text())
	volume.Url = volumeUrl
	volume.Chapters = make([]*model.Chapter, 0)
	volume.CoverUrl = doc.Find(".book-cover").First().AttrOr("src", "")

	cover, err := b.getImg(volume.CoverUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get cover: %w", err)
	}
	volume.Cover = cover

	// Writers first, then illustrators, all into the same Authors list.
	doc.Find(".authorname>a").Each(func(i int, s *goquery.Selection) {
		volume.Authors = append(volume.Authors, strings.TrimSpace(s.Text()))
	})
	doc.Find(".illname>a").Each(func(i int, s *goquery.Selection) {
		volume.Authors = append(volume.Authors, strings.TrimSpace(s.Text()))
	})

	doc.Find(".chapter-li.jsChapter").Each(func(i int, s *goquery.Selection) {
		link := s.Find("a")
		volume.Chapters = append(volume.Chapters, &model.Chapter{
			Title: link.Text(),
			Url:   fmt.Sprintf("https://www.bilinovel.com%v", link.AttrOr("href", "")),
		})
	})

	if skipChapter {
		return volume, nil
	}

	// The dot is escaped so only a literal ".html" suffix matches.
	idRegexp := regexp.MustCompile(`/novel/(\d+)/(\d+)\.html`)
	for i, listed := range volume.Chapters {
		matches := idRegexp.FindStringSubmatch(listed.Url)
		if len(matches) == 0 {
			return nil, fmt.Errorf("failed to get chapter id: %v", listed.Url)
		}
		chapterId, err := strconv.Atoi(matches[2])
		if err != nil {
			return nil, fmt.Errorf("failed to convert chapter id: %w", err)
		}
		chapter, err := b.GetChapter(novelId, volumeId, chapterId)
		if err != nil {
			return nil, fmt.Errorf("failed to get chapter: %w", err)
		}
		chapter.Id = chapterId
		volume.Chapters[i] = chapter
	}
	return volume, nil
}
// getAllVolumes reads the novel's catalog page, collects the ids of all
// volumes linked from "a.volume-cover-img" elements and loads each one with
// GetVolume, in catalog order. skipChapter is forwarded to GetVolume.
func (b *Bilinovel) getAllVolumes(novelId int, skipChapter bool) ([]*model.Volume, error) {
	log.Printf("Getting all volumes of novel %v\n", novelId)

	catelogUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
	resp, err := b.restyClient.R().Get(catelogUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get catelog: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get catelog: %v", resp.Status())
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	// The pattern depends on novelId, so it has to be compiled per call. The
	// dot is escaped so only a literal ".html" suffix matches.
	volumeRegexp := regexp.MustCompile(fmt.Sprintf(`/novel/%v/vol_(\d+)\.html`, novelId))
	var volumeIds []string
	doc.Find("a.volume-cover-img").Each(func(i int, s *goquery.Selection) {
		if matches := volumeRegexp.FindStringSubmatch(s.AttrOr("href", "")); len(matches) > 0 {
			volumeIds = append(volumeIds, matches[1])
		}
	})

	volumes := make([]*model.Volume, 0, len(volumeIds))
	for i, volumeIdStr := range volumeIds {
		volumeId, err := strconv.Atoi(volumeIdStr)
		if err != nil {
			return nil, fmt.Errorf("failed to convert volume id: %w", err)
		}
		volume, err := b.GetVolume(novelId, volumeId, skipChapter)
		if err != nil {
			return nil, fmt.Errorf("failed to get volume info: %w", err)
		}
		// NOTE(review): this overwrites the 1-based SeriesIdx computed by
		// GetVolume with a 0-based position. Kept as-is because the intended
		// convention is not visible here — confirm which one callers expect.
		volume.SeriesIdx = i
		volumes = append(volumes, volume)
	}
	return volumes, nil
}
// GetChapter downloads a whole chapter by requesting its pages one after
// another, starting at page 1, until getChapterByPage reports that there is
// no further page. The first page error aborts the download.
func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model.Chapter, error) {
	log.Printf("Getting chapter %v of novel %v\n", chapterId, novelId)

	result := &model.Chapter{
		Id:       chapterId,
		NovelId:  novelId,
		VolumeId: volumeId,
		Url:      fmt.Sprintf("https://www.bilinovel.com/novel/%v/%v.html", novelId, chapterId),
	}

	for pageNo := 1; ; pageNo++ {
		more, err := b.getChapterByPage(result, pageNo)
		if err != nil {
			return nil, fmt.Errorf("failed to download chapter: %w", err)
		}
		if !more {
			return result, nil
		}
	}
}
// getChapterByPage downloads one page of a chapter, cleans up its markup and
// appends the result to chapter.Content.Html. Images found on the page are
// downloaded into chapter.Content.Images unless text-only mode is enabled.
//
// On page 1 the chapter title is taken from the "#atitle" element. The
// returned bool reports whether the page contains the "next page" link, i.e.
// whether the caller should request page+1.
func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, error) {
	log.Printf("Getting chapter %v by page %v\n", chapter.Id, page)

	pageUrl := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", page)
	headers := map[string]string{
		"Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
		"Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,zh-TW;q=0.6",
		"Cookie":          "night=1;",
	}
	resp, err := b.restyClient.R().SetHeaders(headers).Get(pageUrl)
	if err != nil {
		return false, fmt.Errorf("failed to get chapter: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return false, fmt.Errorf("failed to get chapter: %v", resp.Status())
	}

	rawHtml := resp.String()
	hasNext := strings.Contains(rawHtml, `<a onclick="window.location.href = ReadParams.url_next;">下一頁</a>`)

	// Load the page in the browser to work around the site's out-of-order
	// content; the returned markup has the hidden elements removed.
	resortedHtml, err := b.processContentWithPlaywright(string(resp.Body()))
	if err != nil {
		return false, fmt.Errorf("failed to process html: %w", err)
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(resortedHtml))
	if err != nil {
		return false, fmt.Errorf("failed to parse html: %w", err)
	}

	if page == 1 {
		chapter.Title = doc.Find("#atitle").Text()
	}

	content := doc.Find("#acontent").First()
	content.Find(".cgo").Remove()
	content.Find("center").Remove()
	content.Find(".google-auto-placed").Remove()

	// Pages that declare the "read" font have their last paragraph run through
	// the font mapper.
	if strings.Contains(rawHtml, `font-family: "read"`) {
		lastParagraph := content.Find("p").Last()
		obfuscated, err := lastParagraph.Html()
		if err != nil {
			return false, fmt.Errorf("failed to get html: %w", err)
		}
		var decoded strings.Builder
		for _, r := range obfuscated {
			// NOTE(review): runes the mapper does not resolve are dropped
			// rather than copied through — confirm this is intended.
			if _, mapped, ok := b.fontMapper.MappingRune(r); ok {
				decoded.WriteRune(mapped)
			}
		}
		lastParagraph.SetHtml(decoded.String())
	}

	if chapter.Content == nil {
		chapter.Content = &model.ChaperContent{}
	}

	if b.textOnly {
		content.Find("img").Remove()
	} else {
		content.Find("img").Each(func(i int, s *goquery.Selection) {
			imgUrl := s.AttrOr("data-src", "")
			if imgUrl == "" {
				imgUrl = s.AttrOr("src", "")
			}
			if imgUrl == "" {
				return
			}

			// The local file name is the SHA-256 of the source URL plus the
			// URL's extension; the original URL is kept in "alt".
			imageHash := sha256.Sum256([]byte(imgUrl))
			imageFilename := fmt.Sprintf("%x%s", imageHash[:], path.Ext(imgUrl))
			s.SetAttr("src", imageFilename)
			s.SetAttr("alt", imgUrl)
			s.RemoveAttr("class")

			img, err := b.getImg(imgUrl)
			if err != nil {
				// Best effort: a missing image must not fail the chapter, but
				// the failure should be visible.
				log.Printf("failed to get img %v: %v", imgUrl, err)
				return
			}
			if chapter.Content.Images == nil {
				chapter.Content.Images = make(map[string][]byte)
			}
			chapter.Content.Images[imageFilename] = img
		})
	}

	// Strip every attribute whose name starts with "data-k". The names are
	// collected first because removing an attribute changes the slice being
	// iterated, which could cause later attributes to be skipped.
	doc.Find("*").Each(func(i int, s *goquery.Selection) {
		if len(s.Nodes) == 0 {
			return
		}
		var stale []string
		for _, attr := range s.Nodes[0].Attr {
			if strings.HasPrefix(attr.Key, "data-k") {
				stale = append(stale, attr.Key)
			}
		}
		for _, key := range stale {
			s.RemoveAttr(key)
		}
	})

	htmlStr, err := content.Html()
	if err != nil {
		return false, fmt.Errorf("failed to get html: %w", err)
	}
	chapter.Content.Html += strings.TrimSpace(htmlStr)
	return hasNext, nil
}
// getImg downloads the image at url, sending the site as Referer, and returns
// the response body. A non-200 response is reported as an error so that an
// error page is never stored as image data.
func (b *Bilinovel) getImg(url string) ([]byte, error) {
	log.Printf("Getting img %v\n", url)
	resp, err := b.restyClient.R().SetHeader("Referer", "https://www.bilinovel.com").Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get img %v: %v", url, resp.Status())
	}
	return resp.Body(), nil
}
// processContentWithPlaywright loads htmlContent into the shared browser page
// and returns the page's markup after every element inside "#acontent" whose
// computed display is "none" has been removed.
//
// The HTML is written to a temporary file (deleted on return) and opened via a
// file:// URL. The function waits up to 5 seconds for a response whose URL
// contains "chapterlog.js" and then for "#acontent" to become visible before
// running the clean-up script.
func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, error) {
	tempFile, err := os.CreateTemp("", "bilinovel-temp-*.html")
	if err != nil {
		return "", fmt.Errorf("failed to create temp file: %w", err)
	}
	tempFilePath := tempFile.Name()
	defer os.Remove(tempFilePath)

	// Always close the file, even when the write failed, so the descriptor is
	// not leaked; the browser must see fully flushed content.
	_, writeErr := tempFile.WriteString(htmlContent)
	closeErr := tempFile.Close()
	if writeErr != nil {
		return "", fmt.Errorf("failed to write temp file: %w", writeErr)
	}
	if closeErr != nil {
		return "", fmt.Errorf("failed to close temp file: %w", closeErr)
	}

	// A file URL needs a path that starts with "/"; a Windows path such as
	// "C:/..." would otherwise end up in the host position of the URL.
	urlPath := filepath.ToSlash(tempFilePath)
	if !strings.HasPrefix(urlPath, "/") {
		urlPath = "/" + urlPath
	}
	fileUrl := "file://" + urlPath

	_, err = b.page.ExpectResponse(func(url string) bool {
		return strings.Contains(url, "chapterlog.js")
	}, func() error {
		if _, gotoErr := b.page.Goto(fileUrl); gotoErr != nil {
			return fmt.Errorf("could not navigate to file: %w", gotoErr)
		}
		return nil
	}, playwright.PageExpectResponseOptions{
		Timeout: playwright.Float(5000),
	})
	if err != nil {
		return "", fmt.Errorf("failed to wait for network request finish: %w", err)
	}

	err = b.page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{
		State: playwright.WaitForSelectorStateVisible,
	})
	if err != nil {
		return "", fmt.Errorf("could not wait for #acontent: %w", err)
	}

	// Walk all descendants of #acontent and delete those whose computed
	// display is "none". The script returns a short status string.
	result, err := b.page.Evaluate(`
(function() {
const acontent = document.getElementById('acontent');
if (!acontent) {
return 'acontent element not found';
}
let removedCount = 0;
const elements = acontent.querySelectorAll('*');
// 从后往前遍历,避免删除元素时影响索引
for (let i = elements.length - 1; i >= 0; i--) {
const element = elements[i];
const computedStyle = window.getComputedStyle(element);
if (computedStyle.display === 'none') {
element.remove();
removedCount++;
}
}
return 'Removed ' + removedCount + ' hidden elements';
})()
`)
	if err != nil {
		return "", fmt.Errorf("failed to remove hidden elements: %w", err)
	}
	log.Printf("Hidden elements removal result: %v", result)

	processedHTML, err := b.page.Content()
	if err != nil {
		return "", fmt.Errorf("could not get page content: %w", err)
	}
	return processedHTML, nil
}