Files
bilinovel-downloader/downloader/bilinovel/bilinovel.go
nite 17c3859e9e feat(logging): Implement structured logging and debug mode
fix: Windows cannot download novels correctly
2025-10-17 01:36:23 +11:00

641 lines
18 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package bilinovel
import (
"bilinovel-downloader/model"
"bilinovel-downloader/utils"
"bytes"
"crypto/sha256"
_ "embed"
"fmt"
"log/slog"
"net/http"
"os"
"path"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"sync"
"github.com/PuerkitoBio/goquery"
mapper "github.com/bestnite/font-mapper"
"github.com/playwright-community/playwright-go"
)
// readTTF holds the bytes of read.ttf embedded at build time. New passes it
// as the first font to mapper.NewGlyphOutlineMapper.
//go:embed read.ttf
var readTTF []byte

// miLantingTTF holds the bytes of "MI LANTING.ttf" embedded at build time.
// New passes it as the second font to mapper.NewGlyphOutlineMapper.
//go:embed "MI LANTING.ttf"
var miLantingTTF []byte
// Bilinovel bundles the state used to scrape novels from www.bilinovel.com:
// an HTTP client, a font mapper, a logger and a reusable playwright browser.
type Bilinovel struct {
	// fontMapper is built in New from the two embedded fonts and is used by
	// getChapterByPage to remap runes of paragraphs rendered with the "read" font.
	fontMapper *mapper.GlyphOutlineMapper
	// textOnly makes getChapterByPage remove <img> elements instead of downloading them.
	textOnly bool
	// restyClient performs all plain HTTP requests (created via utils.NewRestyClient(50);
	// the meaning of 50 is defined in utils — TODO confirm).
	restyClient *utils.RestyClient
	// Reused browser instance.
	browser        playwright.Browser
	browserContext playwright.BrowserContext
	// pages caches one playwright page per "<novelId>-<volumeId>" key.
	pages map[string]playwright.Page
	// concurrency is copied from BilinovelNewOption.Concurrency in New.
	concurrency int
	// concurrentChan is a buffered channel used as a semaphore by getAllVolumes:
	// a send acquires a slot, a receive releases it.
	concurrentChan chan any
	logger         *slog.Logger
}
// BilinovelNewOption carries the settings accepted by New.
type BilinovelNewOption struct {
	// Concurrency is used by New as the capacity of the semaphore channel that
	// bounds how many volumes getAllVolumes fetches at once.
	Concurrency int
	// Debug selects debug-level logging in New and is forwarded to initBrowser,
	// where it turns off headless mode and enables devtools.
	Debug bool
}
// New builds a Bilinovel downloader: it creates the font mapper from the
// embedded fonts, the HTTP client, a text logger writing to stdout, and then
// starts the playwright browser via initBrowser.
//
// option.Concurrency values below 1 are treated as 1. A capacity of 0 would
// make the semaphore channel unbuffered, so the slot acquisition in
// getAllVolumes would block forever, and a negative capacity makes make panic.
//
// It returns an error when the font mapper cannot be created or the browser
// cannot be initialized.
func New(option BilinovelNewOption) (*Bilinovel, error) {
	fontMapper, err := mapper.NewGlyphOutlineMapper(readTTF, miLantingTTF)
	if err != nil {
		return nil, fmt.Errorf("failed to create font mapper: %w", err)
	}

	concurrency := option.Concurrency
	if concurrency < 1 {
		concurrency = 1
	}

	logLevel := slog.LevelInfo
	if option.Debug {
		logLevel = slog.LevelDebug
	}
	handlerOptions := &slog.HandlerOptions{
		Level: logLevel,
	}

	b := &Bilinovel{
		fontMapper:     fontMapper,
		textOnly:       false,
		restyClient:    utils.NewRestyClient(50),
		pages:          make(map[string]playwright.Page),
		concurrency:    concurrency,
		concurrentChan: make(chan any, concurrency),
		logger:         slog.New(slog.NewTextHandler(os.Stdout, handlerOptions)),
	}

	// Initialize the browser instance.
	if err := b.initBrowser(option.Debug); err != nil {
		return nil, fmt.Errorf("failed to init browser: %w", err)
	}
	return b, nil
}
// SetTextOnly sets the textOnly flag. When it is true, getChapterByPage
// removes <img> elements from chapter content instead of downloading them.
func (b *Bilinovel) SetTextOnly(textOnly bool) {
	b.textOnly = textOnly
}
// GetExtraFiles reports the extra files this downloader provides; there are
// none, so the result is always a nil slice.
func (b *Bilinovel) GetExtraFiles() []model.ExtraFile {
	var none []model.ExtraFile
	return none
}
// initBrowser starts the playwright driver, launches Chromium and creates the
// browser context that is reused for every chapter page.
//
// When debug is true the browser runs headed with devtools open; otherwise it
// runs headless.
//
// b.browser and b.browserContext are only assigned when every step succeeds.
// If a later step fails, whatever was already started (browser, driver) is
// shut down again so no process is left behind.
//
// NOTE(review): the *playwright.Playwright handle is not retained on success,
// so Close cannot stop the driver later — consider storing it on the struct.
func (b *Bilinovel) initBrowser(debug bool) error {
	pw, err := playwright.Run()
	if err != nil {
		return fmt.Errorf("could not start playwright: %w", err)
	}

	// stopDriver is a best-effort cleanup used on the error paths below.
	stopDriver := func() {
		if stopErr := pw.Stop(); stopErr != nil {
			b.logger.Error("could not stop playwright", slog.Any("error", stopErr))
		}
	}

	browser, err := pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(!debug),
		Devtools: playwright.Bool(debug),
	})
	if err != nil {
		stopDriver()
		return fmt.Errorf("could not launch browser: %w", err)
	}

	browserContext, err := browser.NewContext()
	if err != nil {
		if closeErr := browser.Close(); closeErr != nil {
			b.logger.Error("could not close browser", slog.Any("error", closeErr))
		}
		stopDriver()
		return fmt.Errorf("could not create browser context: %w", err)
	}

	b.browser = browser
	b.browserContext = browserContext
	b.logger.Info("Browser initialized successfully")
	return nil
}
// Close releases the browser held by b. It is a no-op when no browser is set.
// A failure to close the browser is logged rather than returned, so the result
// is always nil; afterwards the browser and browser-context fields are cleared.
func (b *Bilinovel) Close() error {
	if b.browser == nil {
		return nil
	}

	closeErr := b.browser.Close()
	if closeErr != nil {
		b.logger.Error("could not close browser", slog.Any("error", closeErr))
	}

	b.browser, b.browserContext = nil, nil
	return nil
}
// styleCSS holds the bytes of style.css embedded at build time; GetStyleCSS
// returns it as a string.
//go:embed style.css
var styleCSS []byte
// GetStyleCSS returns the embedded stylesheet (styleCSS) as a string.
func (b *Bilinovel) GetStyleCSS() string {
	css := string(styleCSS)
	return css
}
// GetNovel downloads the novel page for novelId and returns its metadata
// together with its volumes.
//
// Title and description are read from the ".book-title" and
// ".book-summary>content" elements. The link texts under ".authorname" and
// then ".illname" are both appended to Authors, in that order. Volumes are
// loaded through getAllVolumes, which receives skipChapterContent and
// skipVolumes unchanged.
//
// An error is returned when the page cannot be fetched, answers with a status
// other than 200, cannot be parsed, or when getAllVolumes fails. Underlying
// errors are wrapped with %w so callers can inspect them with errors.Is/As.
func (b *Bilinovel) GetNovel(novelId int, skipChapterContent bool, skipVolumes []int) (*model.Novel, error) {
	b.logger.Info("Getting novel", slog.Int("novelId", novelId))

	novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v.html", novelId)
	resp, err := b.restyClient.R().Get(novelUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	novel := &model.Novel{
		Id:          novelId,
		Title:       strings.TrimSpace(doc.Find(".book-title").First().Text()),
		Description: strings.TrimSpace(doc.Find(".book-summary>content").First().Text()),
	}
	// Authors first, then illustrators; both end up in Authors.
	for _, selector := range []string{".authorname>a", ".illname>a"} {
		doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
			novel.Authors = append(novel.Authors, strings.TrimSpace(s.Text()))
		})
	}

	volumes, err := b.getAllVolumes(novelId, skipChapterContent, skipVolumes)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel volumes: %w", err)
	}
	novel.Volumes = volumes
	return novel, nil
}
// chapterURLRegexp extracts the novel id (group 1) and chapter id (group 2)
// from a chapter URL of the form /novel/<novelId>/<chapterId>.html. It is
// compiled once at package scope instead of on every GetVolume call.
var chapterURLRegexp = regexp.MustCompile(`/novel/(\d+)/(\d+)\.html`)

// GetVolume downloads one volume of a novel.
//
// It first loads the novel catalog to find the 1-based position of the volume
// among the "a.volume-cover-img" links (stored in SeriesIdx) and the novel
// title. It then loads the volume page for title, description, cover, authors
// (".authorname" followed by ".illname") and the chapter list. Unless
// skipChapterContent is true, every chapter is replaced by the result of
// GetChapter.
//
// An error is returned when a page cannot be fetched or parsed, when the
// volume is not listed in the catalog, when the cover cannot be downloaded,
// when a chapter URL carries no chapter id, or when a chapter fails to
// download. Underlying errors are wrapped with %w.
func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapterContent bool) (*model.Volume, error) {
	b.logger.Info("Getting volume of novel", slog.Int("volumeId", volumeId), slog.Int("novelId", novelId))

	novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
	resp, err := b.restyClient.R().Get(novelUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	// Locate the volume in the catalog; 0 means "not found".
	wantHref := fmt.Sprintf("/novel/%v/vol_%v.html", novelId, volumeId)
	seriesIdx := 0
	doc.Find("a.volume-cover-img").Each(func(i int, s *goquery.Selection) {
		if s.AttrOr("href", "") == wantHref {
			seriesIdx = i + 1
		}
	})
	novelTitle := strings.TrimSpace(doc.Find(".book-title").First().Text())
	if seriesIdx == 0 {
		return nil, fmt.Errorf("volume not found: %v", volumeId)
	}

	volumeUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/vol_%v.html", novelId, volumeId)
	resp, err = b.restyClient.R().Get(volumeUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}
	doc, err = goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	volume := &model.Volume{}
	volume.NovelId = novelId
	volume.NovelTitle = novelTitle
	volume.Id = volumeId
	volume.SeriesIdx = seriesIdx
	volume.Title = strings.TrimSpace(doc.Find(".book-title").First().Text())
	volume.Description = strings.TrimSpace(doc.Find(".book-summary>content").First().Text())
	volume.Url = volumeUrl
	volume.Chapters = make([]*model.Chapter, 0)
	volume.CoverUrl = doc.Find(".book-cover").First().AttrOr("src", "")

	cover, err := b.getImg(volume.CoverUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get cover: %w", err)
	}
	volume.Cover = cover

	// Authors first, then illustrators; both end up in Authors.
	for _, selector := range []string{".authorname>a", ".illname>a"} {
		doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
			volume.Authors = append(volume.Authors, strings.TrimSpace(s.Text()))
		})
	}

	doc.Find(".chapter-li.jsChapter").Each(func(_ int, s *goquery.Selection) {
		link := s.Find("a")
		volume.Chapters = append(volume.Chapters, &model.Chapter{
			Title: link.Text(),
			Url:   fmt.Sprintf("https://www.bilinovel.com%v", link.AttrOr("href", "")),
		})
	})

	if skipChapterContent {
		return volume, nil
	}

	for i, stub := range volume.Chapters {
		matches := chapterURLRegexp.FindStringSubmatch(stub.Url)
		if len(matches) == 0 {
			return nil, fmt.Errorf("failed to get chapter id: %v", stub.Url)
		}
		chapterId, err := strconv.Atoi(matches[2])
		if err != nil {
			return nil, fmt.Errorf("failed to convert chapter id: %w", err)
		}
		// GetChapter already sets Id on the chapter it returns.
		chapter, err := b.GetChapter(novelId, volumeId, chapterId)
		if err != nil {
			return nil, fmt.Errorf("failed to get chapter: %w", err)
		}
		volume.Chapters[i] = chapter
	}
	return volume, nil
}
// bilinovelPagesMu guards every access to Bilinovel.pages made by
// getAllVolumes and GetChapter. getAllVolumes runs GetVolume (and through it
// GetChapter) on several goroutines at once, and unsynchronized map access
// from multiple goroutines is a data race that can abort the program with
// "concurrent map writes".
//
// NOTE(review): a mutex field on Bilinovel would be the more natural home;
// it is declared here so this change does not depend on the struct definition.
var bilinovelPagesMu sync.Mutex

// getAllVolumes loads the catalog of novelId and downloads every listed
// volume, running up to cap(b.concurrentChan) downloads concurrently.
//
// Volumes whose id appears in skipVolumes are not fetched. A volume that fails
// to download is logged and left out of the result; it does not fail the whole
// call. The returned slice keeps catalog order. The only errors returned come
// from fetching or parsing the catalog itself.
func (b *Bilinovel) getAllVolumes(novelId int, skipChapterContent bool, skipVolumes []int) ([]*model.Volume, error) {
	b.logger.Info("Getting all volumes of novel", slog.Int("novelId", novelId))

	catelogUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
	resp, err := b.restyClient.R().Get(catelogUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get catelog: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get catelog: %v", resp.Status())
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	volumeRegexp := regexp.MustCompile(fmt.Sprintf(`/novel/%v/vol_(\d+)\.html`, novelId))
	volumeIds := make([]string, 0)
	doc.Find("a.volume-cover-img").Each(func(_ int, s *goquery.Selection) {
		if matches := volumeRegexp.FindStringSubmatch(s.AttrOr("href", "")); len(matches) > 0 {
			volumeIds = append(volumeIds, matches[1])
		}
	})

	// Each goroutine writes only its own index, so the slice itself needs no
	// lock; wg.Wait orders those writes before the reads below.
	volumes := make([]*model.Volume, len(volumeIds))
	var wg sync.WaitGroup
	for i, volumeIdStr := range volumeIds {
		wg.Add(1)
		b.concurrentChan <- struct{}{} // acquire a concurrency slot
		go func(i int, volumeIdStr string) {
			defer wg.Done()
			defer func() { <-b.concurrentChan }() // release the concurrency slot

			volumeId, err := strconv.Atoi(volumeIdStr)
			if err != nil {
				b.logger.Error("failed to convert volume id", slog.String("volumeIdStr", volumeIdStr), slog.Any("error", err))
				return
			}
			if slices.Contains(skipVolumes, volumeId) {
				return
			}

			// Close this volume's browser tab whether or not the download
			// succeeds, so a failed volume does not leave its tab open.
			defer func() {
				pwPageKey := fmt.Sprintf("%v-%v", novelId, volumeId)
				bilinovelPagesMu.Lock()
				pwPage, ok := b.pages[pwPageKey]
				if ok {
					delete(b.pages, pwPageKey)
				}
				bilinovelPagesMu.Unlock()
				if ok {
					_ = pwPage.Close() // best effort; nothing useful to do on failure
				}
			}()

			volume, err := b.GetVolume(novelId, volumeId, skipChapterContent)
			if err != nil {
				b.logger.Error("failed to get volume info", slog.Int("novelId", novelId), slog.Int("volumeId", volumeId), slog.Any("error", err))
				return
			}
			// NOTE(review): GetVolume sets a 1-based SeriesIdx; this overwrites
			// it with the 0-based catalog position — confirm which is intended.
			volume.SeriesIdx = i
			volumes[i] = volume
		}(i, volumeIdStr)
	}
	wg.Wait()

	// Drop the nil entries left by skipped or failed volumes.
	filteredVolumes := make([]*model.Volume, 0, len(volumes))
	for _, vol := range volumes {
		if vol != nil {
			filteredVolumes = append(filteredVolumes, vol)
		}
	}
	return filteredVolumes, nil
}

// GetChapter downloads every page of one chapter and returns the assembled
// chapter.
//
// Pages are rendered in a playwright page that is cached in b.pages under the
// key "<novelId>-<volumeId>", so all chapters of a volume reuse one tab. The
// page is created on first use; this function does not close it. Pages are
// fetched starting at 1 until getChapterByPage reports that no next page exists.
//
// An error is returned when the browser page cannot be created or when any
// chapter page fails to download.
func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model.Chapter, error) {
	b.logger.Info("Getting chapter of novel", slog.Int("chapterId", chapterId), slog.Int("novelId", novelId))

	chapter := &model.Chapter{
		Id:       chapterId,
		NovelId:  novelId,
		VolumeId: volumeId,
		Url:      fmt.Sprintf("https://www.bilinovel.com/novel/%v/%v.html", novelId, chapterId),
	}

	// Look up or create the volume's tab under the lock shared with getAllVolumes.
	pwPageKey := fmt.Sprintf("%v-%v", novelId, volumeId)
	pwPage, err := func() (playwright.Page, error) {
		bilinovelPagesMu.Lock()
		defer bilinovelPagesMu.Unlock()
		if cached, ok := b.pages[pwPageKey]; ok {
			return cached, nil
		}
		created, err := b.browserContext.NewPage()
		if err != nil {
			return nil, err
		}
		b.pages[pwPageKey] = created
		return created, nil
	}()
	if err != nil {
		return nil, fmt.Errorf("failed to create browser page: %w", err)
	}

	for pageNum := 1; ; pageNum++ {
		hasNext, err := b.getChapterByPage(pwPage, chapter, pageNum)
		if err != nil {
			return nil, fmt.Errorf("failed to download chapter: %w", err)
		}
		if !hasNext {
			break
		}
	}
	return chapter, nil
}
// nextPageUrlRegexp captures the quoted value that follows "url_next:" in the
// page source (single or double quotes); group 1 is the value without quotes.
var nextPageUrlRegexp = regexp.MustCompile(`url_next:\s?['"]([^'"]*?)['"]`)

// cleanNextPageUrlRegexp matches a trailing ".html", optionally preceded by a
// "_<digits>" page suffix, so it can be stripped from a URL.
var cleanNextPageUrlRegexp = regexp.MustCompile(`(_\d+)?\.html$`)
// getChapterByPage downloads page pageNum of chapter, cleans it up and appends
// its HTML to chapter.Content.Html.
//
// The raw page is fetched over HTTP, passed through
// processContentWithPlaywright (which renders it in pwPage), and the
// "#acontent" element is then extracted. Ad containers are removed. If the
// page uses the "read" font, the last paragraph is remapped rune by rune
// through b.fontMapper. Images are either removed (textOnly) or downloaded
// into chapter.Content.Images under a hash-derived filename, and every
// attribute whose name starts with "data-k" is stripped. On page 1 the chapter
// title is taken from "#atitle".
//
// It returns true when the chapter has a further page: either the raw HTML
// contains the "next page" anchor, or the cleaned url_next value is a
// non-empty substring of this page's URL. An error is returned when the page
// cannot be fetched, rendered or parsed, or when no url_next value is found.
func (b *Bilinovel) getChapterByPage(pwPage playwright.Page, chapter *model.Chapter, pageNum int) (bool, error) {
	b.logger.Info("Getting chapter by page", slog.Int("chapter", chapter.Id), slog.Int("page", pageNum))

	pageURL := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", pageNum)
	headers := map[string]string{
		"Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
		"Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,zh-TW;q=0.6",
		"Cookie":          "night=1;",
	}
	resp, err := b.restyClient.R().SetHeaders(headers).Get(pageURL)
	if err != nil {
		return false, fmt.Errorf("failed to get chapter: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return false, fmt.Errorf("failed to get chapter: %v", resp.Status())
	}

	rawHTML := string(resp.Body())
	hasNext := strings.Contains(rawHTML, `<a onclick="window.location.href = ReadParams.url_next;">下一頁</a>`)

	// Render in the browser to undo the shuffled paragraph order.
	resortedHtml, err := b.processContentWithPlaywright(pwPage, rawHTML)
	if err != nil {
		return false, fmt.Errorf("failed to process html: %w", err)
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(resortedHtml))
	if err != nil {
		return false, fmt.Errorf("failed to parse html: %w", err)
	}

	// Decide whether the chapter continues on another page.
	n := nextPageUrlRegexp.FindStringSubmatch(resortedHtml)
	if len(n) != 2 {
		return false, fmt.Errorf("failed to determine wether there is a next page")
	}
	// An empty cleaned value must not count as a match: strings.Contains
	// reports true for an empty substring, which would make every page look
	// like it has a successor.
	if nextBase := cleanNextPageUrlRegexp.ReplaceAllString(n[1], ""); nextBase != "" && strings.Contains(pageURL, nextBase) {
		hasNext = true
	}

	if pageNum == 1 {
		chapter.Title = doc.Find("#atitle").Text()
	}

	content := doc.Find("#acontent").First()
	content.Find(".cgo").Remove()
	content.Find("center").Remove()
	content.Find(".google-auto-placed").Remove()

	if strings.Contains(resortedHtml, `font-family: "read"`) {
		lastParagraph := content.Find("p").Last()
		obfuscated, err := lastParagraph.Html()
		if err != nil {
			return false, fmt.Errorf("failed to get html: %w", err)
		}
		var builder strings.Builder
		for _, r := range obfuscated {
			// Runes for which the mapper reports ok == false are dropped.
			// NOTE(review): confirm the mapper returns ok for runes that need
			// no remapping, otherwise they are lost here.
			if _, newRune, ok := b.fontMapper.MappingRune(r); ok {
				builder.WriteRune(newRune)
			}
		}
		lastParagraph.SetHtml(builder.String())
	}

	if chapter.Content == nil {
		chapter.Content = &model.ChaperContent{}
	}

	if b.textOnly {
		content.Find("img").Remove()
	} else {
		content.Find("img").Each(func(_ int, s *goquery.Selection) {
			imgUrl := s.AttrOr("data-src", "")
			if imgUrl == "" {
				imgUrl = s.AttrOr("src", "")
			}
			if imgUrl == "" {
				return
			}

			// Local filename: hex SHA-256 of the URL plus the URL's extension.
			imageHash := sha256.Sum256([]byte(imgUrl))
			imageFilename := fmt.Sprintf("%x%s", string(imageHash[:]), path.Ext(imgUrl))
			s.SetAttr("src", imageFilename)
			s.SetAttr("alt", imgUrl)
			s.RemoveAttr("class")

			img, err := b.getImg(imgUrl)
			if err != nil {
				// Keep going without the image, but leave a trace of why the
				// rewritten src has no matching entry in Images.
				b.logger.Warn("failed to download image", slog.String("url", imgUrl), slog.Any("error", err))
				return
			}
			if chapter.Content.Images == nil {
				chapter.Content.Images = make(map[string][]byte)
			}
			chapter.Content.Images[imageFilename] = img
		})
	}

	// Strip every attribute whose name starts with "data-k".
	doc.Find("*").Each(func(_ int, s *goquery.Selection) {
		if len(s.Nodes) == 0 {
			return
		}
		// Collect the names first: RemoveAttr changes the node's attribute
		// list, so removing while ranging over it can skip entries.
		var keys []string
		for _, attr := range s.Nodes[0].Attr {
			if strings.HasPrefix(attr.Key, "data-k") {
				keys = append(keys, attr.Key)
			}
		}
		for _, key := range keys {
			s.RemoveAttr(key)
		}
	})

	htmlStr, err := content.Html()
	if err != nil {
		return false, fmt.Errorf("failed to get html: %w", err)
	}
	chapter.Content.Html += strings.TrimSpace(htmlStr)
	return hasNext, nil
}
// getImg downloads the image at url, sending a bilinovel Referer header, and
// returns the response body.
//
// An error is returned when the request fails or the server answers with a
// status other than 200, so that an error page is never stored as image data.
// This matches the status handling of the other fetches in this file.
func (b *Bilinovel) getImg(url string) ([]byte, error) {
	b.logger.Info("Getting img", slog.String("url", url))

	resp, err := b.restyClient.R().SetHeader("Referer", "https://www.bilinovel.com").Get(url)
	if err != nil {
		return nil, fmt.Errorf("failed to get img %q: %w", url, err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get img %q: %v", url, resp.Status())
	}
	return resp.Body(), nil
}
// processContentWithPlaywright renders htmlContent in the given (reused)
// browser page and returns the page's HTML after hidden elements have been
// removed.
//
// Steps:
//  1. Replace "window.location.replace" with "console.log" so the page cannot
//     navigate away.
//  2. Write the HTML to a temp file under <tmp>/bilinovel-downloader and open
//     it through a file:// URL, waiting up to 10 s for a response whose URL
//     contains "chapterlog.js".
//  3. Wait up to 10 s for "#acontent" to become visible.
//  4. Remove every descendant of "#acontent" whose computed display is "none"
//     or whose transform is the all-zero matrix.
//
// The temp file is closed before navigation and removed when the function
// returns. An error is returned when any of these steps fails.
func (b *Bilinovel) processContentWithPlaywright(page playwright.Page, htmlContent string) (string, error) {
	// Neutralize window.location.replace to keep the page from redirecting.
	htmlContent = strings.ReplaceAll(htmlContent, "window.location.replace", "console.log")

	tempPath := filepath.Join(os.TempDir(), "bilinovel-downloader")
	if err := os.MkdirAll(tempPath, 0755); err != nil {
		return "", fmt.Errorf("failed to create temp dir: %w", err)
	}
	tempFile, err := os.CreateTemp(tempPath, "temp-*.html")
	if err != nil {
		return "", fmt.Errorf("failed to create temp file: %w", err)
	}
	tempFilePath := tempFile.Name()
	defer os.Remove(tempFilePath)

	// Always close the handle, even when the write fails: an open handle
	// leaks a descriptor and on Windows prevents the deferred removal.
	_, writeErr := tempFile.WriteString(htmlContent)
	closeErr := tempFile.Close()
	if writeErr != nil {
		return "", fmt.Errorf("failed to write temp file: %w", writeErr)
	}
	if closeErr != nil {
		return "", fmt.Errorf("failed to close temp file: %w", closeErr)
	}

	// Build a well-formed file URL. A Windows path such as C:/Temp/x.html needs
	// a leading slash (file:///C:/Temp/x.html); otherwise "C:" would sit in the
	// authority position of the URL.
	urlPath := filepath.ToSlash(tempFilePath)
	if !strings.HasPrefix(urlPath, "/") {
		urlPath = "/" + urlPath
	}
	fileURL := "file://" + urlPath

	// // Block requests
	// googleAdsDomains := []string{
	// "adtrafficquality.google",
	// "doubleclick.net",
	// "googlesyndication.com",
	// "googletagmanager.com",
	// "hm.baidu.com",
	// "cloudflareinsights.com",
	// "fsdoa.js", // adblock detection
	// "https://www.linovelib.com/novel/", // stop the local file from redirecting to the online page
	// }
	// err = page.Route("**/*", func(route playwright.Route) {
	// for _, d := range googleAdsDomains {
	// if strings.Contains(route.Request().URL(), d) {
	// b.logger.Debug("blocking request", slog.String("url", route.Request().URL()))
	// err := route.Abort("aborted")
	// if err != nil {
	// b.logger.Debug("failed to block request", route.Request().URL(), err)
	// }
	// return
	// }
	// }
	// _ = route.Continue()
	// })
	// if err != nil {
	// return "", fmt.Errorf("failed to intercept requests: %w", err)
	// }

	_, err = page.ExpectResponse(func(url string) bool {
		return strings.Contains(url, "chapterlog.js")
	}, func() error {
		// Use a local error so the enclosing err is not written from inside
		// the callback.
		if _, gotoErr := page.Goto(fileURL); gotoErr != nil {
			return fmt.Errorf("could not navigate to file: %w", gotoErr)
		}
		return nil
	}, playwright.PageExpectResponseOptions{
		Timeout: playwright.Float(10000),
	})
	if err != nil {
		return "", fmt.Errorf("failed to wait for network request finish: %w", err)
	}

	err = page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{
		State:   playwright.WaitForSelectorStateVisible,
		Timeout: playwright.Float(10000),
	})
	if err != nil {
		return "", fmt.Errorf("could not wait for #acontent: %w", err)
	}

	// Walk all descendants of #acontent and delete those that the computed
	// style marks as hidden (display none, or an all-zero transform matrix).
	result, err := page.Evaluate(`
(function() {
const acontent = document.getElementById('acontent');
if (!acontent) {
return 'acontent element not found';
}
let removedCount = 0;
const elements = acontent.querySelectorAll('*');
// 从后往前遍历,避免删除元素时影响索引
for (let i = elements.length - 1; i >= 0; i--) {
const element = elements[i];
const computedStyle = window.getComputedStyle(element);
if (computedStyle.display === 'none' || computedStyle.transform == 'matrix(0, 0, 0, 0, 0, 0)') {
element.remove();
removedCount++;
}
}
return 'Removed ' + removedCount + ' hidden elements';
})()
`)
	if err != nil {
		return "", fmt.Errorf("failed to remove hidden elements: %w", err)
	}
	b.logger.Debug("Hidden elements removal result", slog.Any("count", result))

	processedHTML, err := page.Content()
	if err != nil {
		return "", fmt.Errorf("could not get page content: %w", err)
	}
	return processedHTML, nil
}