feat: Add concurrency and headless options for downloads

This commit introduces new features for controlling the download process:

-   **Concurrency**: Users can now specify the number of concurrent volume downloads using the `--concurrency` flag. This significantly speeds up the download of entire novels.
-   **Headless Mode**: A `--headless` flag has been added to control whether the browser operates in headless mode (without a visible UI). Headless mode is the default and suits environments without a display; pass `--headless=false` to show the browser window when debugging.

**Changes include:**

-   Updated `download` command to accept `--concurrency` and `--headless` flags.
-   Refactored `bilinovel` downloader to support `BilinovelNewOption` for configuring headless mode and concurrency.
-   Implemented a page pool and concurrency control mechanism within the `bilinovel` downloader to manage concurrent browser page usage.
-   Added a `downloadNovel` helper to the `download` command (alongside the existing `downloadVolume`) and parallelized `getAllVolumes` in the `bilinovel` downloader, utilizing goroutines and a wait group for parallel processing.
-   Updated `.vscode/launch.json` with new configurations for testing novel and volume downloads with the new options.
This commit is contained in:
2025-10-06 10:20:36 +11:00
parent 6084386989
commit b0f8f31dcc
8 changed files with 209 additions and 87 deletions

21
.vscode/launch.json vendored
View File

@@ -2,7 +2,7 @@
"version": "0.2.0",
"configurations": [
{
"name": "download",
"name": "novel",
"type": "go",
"request": "launch",
"mode": "auto",
@@ -10,9 +10,22 @@
"args": [
"download",
"-n",
"2321",
"-v",
"162759"
"2727",
"--concurrency",
"5"
]
},
{
"name": "volume",
"type": "go",
"request": "launch",
"mode": "auto",
"program": "${workspaceFolder}",
"args": [
"download",
"-n=2727",
"-v=150098",
"--headless=false"
]
}
]

View File

@@ -1,6 +1,7 @@
package cmd
import (
"bilinovel-downloader/downloader"
"bilinovel-downloader/downloader/bilinovel"
"bilinovel-downloader/epub"
"bilinovel-downloader/model"
@@ -27,10 +28,12 @@ var downloadCmd = &cobra.Command{
}
type downloadCmdArgs struct {
NovelId int `validate:"required"`
VolumeId int `validate:"required"`
outputPath string
outputType string
NovelId int `validate:"required"`
VolumeId int `validate:"required"`
outputPath string
outputType string
headless bool
concurrency int
}
var (
@@ -42,11 +45,16 @@ func init() {
downloadCmd.Flags().IntVarP(&downloadArgs.VolumeId, "volume-id", "v", 0, "volume id")
downloadCmd.Flags().StringVarP(&downloadArgs.outputPath, "output-path", "o", "novels", "output path")
downloadCmd.Flags().StringVarP(&downloadArgs.outputType, "output-type", "t", "epub", "output type, epub or text")
downloadCmd.Flags().BoolVar(&downloadArgs.headless, "headless", true, "headless mode")
downloadCmd.Flags().IntVar(&downloadArgs.concurrency, "concurrency", 3, "concurrency of downloading volumes")
RootCmd.AddCommand(downloadCmd)
}
func runDownloadNovel() error {
downloader, err := bilinovel.New()
downloader, err := bilinovel.New(bilinovel.BilinovelNewOption{
Headless: downloadArgs.headless,
Concurrency: downloadArgs.concurrency,
})
if err != nil {
return fmt.Errorf("failed to create downloader: %v", err)
}
@@ -63,16 +71,10 @@ func runDownloadNovel() error {
if downloadArgs.VolumeId == 0 {
// 下载整本小说
novel, err := downloader.GetNovel(downloadArgs.NovelId, true)
err := downloadNovel(downloader, downloadArgs.NovelId)
if err != nil {
return fmt.Errorf("failed to get novel: %v", err)
}
for _, volume := range novel.Volumes {
err = downloadVolume(downloader, volume.Id)
if err != nil {
return fmt.Errorf("failed to download volume: %v", err)
}
}
} else {
// 下载单卷
err = downloadVolume(downloader, downloadArgs.VolumeId)
@@ -84,7 +86,59 @@ func runDownloadNovel() error {
return nil
}
func downloadVolume(downloader model.Downloader, volumeId int) error {
// downloadNovel downloads every volume of the novel identified by novelId
// that is not already present in the output directory, then packs each newly
// downloaded volume in the configured output format.
//
// It first fetches the novel with chapter content skipped in order to list its
// volumes. A volume whose "volume-<novelId>-<volumeId>.json" file already
// exists under downloadArgs.outputPath is considered downloaded and its id is
// passed as a skipped volume to the second, full fetch. Every volume returned
// by that fetch is written to its JSON file and then packed as "epub" or
// "text" according to downloadArgs.outputType; any other output type leaves
// only the JSON file.
//
// The first error encountered is returned, wrapped with context.
func downloadNovel(downloader downloader.Downloader, novelId int) error {
	// volumeJSONPath returns where the JSON dump of the given volume is stored.
	volumeJSONPath := func(volumeId int) string {
		return filepath.Join(downloadArgs.outputPath, fmt.Sprintf("volume-%d-%d.json", novelId, volumeId))
	}

	novelInfo, err := downloader.GetNovel(novelId, true, nil)
	if err != nil {
		return fmt.Errorf("failed to get novel info: %w", err)
	}

	// Every volume file lives directly in the output directory, so creating it
	// once is enough. Clean maps an empty output path to ".", matching what
	// filepath.Dir of a joined file path would yield.
	if err := os.MkdirAll(filepath.Clean(downloadArgs.outputPath), 0755); err != nil {
		return fmt.Errorf("failed to create directory: %w", err)
	}

	skipVolumes := make([]int, 0, len(novelInfo.Volumes))
	for _, volume := range novelInfo.Volumes {
		if _, err := os.Stat(volumeJSONPath(volume.Id)); err == nil {
			// Already downloaded.
			skipVolumes = append(skipVolumes, volume.Id)
		}
	}

	novel, err := downloader.GetNovel(novelId, false, skipVolumes)
	if err != nil {
		return fmt.Errorf("failed to download novel: %w", err)
	}

	for _, volume := range novel.Volumes {
		jsonPath := volumeJSONPath(volume.Id)
		jsonFile, err := os.Create(jsonPath)
		if err != nil {
			return fmt.Errorf("failed to create json file: %w", err)
		}
		encodeErr := json.NewEncoder(jsonFile).Encode(volume)
		// Close right away instead of deferring: a defer inside the loop would
		// keep every file open until the whole novel has been processed.
		closeErr := jsonFile.Close()
		if encodeErr != nil || closeErr != nil {
			// A partial file would make the next run treat this volume as
			// already downloaded. Removal is best effort; the write error
			// reported below is the one that matters.
			_ = os.Remove(jsonPath)
		}
		if encodeErr != nil {
			return fmt.Errorf("failed to encode json file: %w", encodeErr)
		}
		if closeErr != nil {
			return fmt.Errorf("failed to close json file: %w", closeErr)
		}

		switch downloadArgs.outputType {
		case "epub":
			if err := epub.PackVolumeToEpub(volume, downloadArgs.outputPath, downloader.GetStyleCSS(), downloader.GetExtraFiles()); err != nil {
				return fmt.Errorf("failed to pack volume: %w", err)
			}
		case "text":
			if err := text.PackVolumeToText(volume, downloadArgs.outputPath); err != nil {
				return fmt.Errorf("failed to pack volume: %w", err)
			}
		}
	}
	return nil
}
func downloadVolume(downloader downloader.Downloader, volumeId int) error {
jsonPath := filepath.Join(downloadArgs.outputPath, fmt.Sprintf("volume-%d-%d.json", downloadArgs.NovelId, volumeId))
err := os.MkdirAll(filepath.Dir(jsonPath), 0755)
if err != nil {

View File

@@ -13,8 +13,10 @@ import (
"path"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"sync"
"github.com/PuerkitoBio/goquery"
mapper "github.com/bestnite/font-mapper"
@@ -35,10 +37,17 @@ type Bilinovel struct {
// 浏览器实例复用
browser playwright.Browser
browserContext playwright.BrowserContext
page playwright.Page
pages map[string]playwright.Page
concurrency int
concurrentChan chan any
}
func New() (*Bilinovel, error) {
// BilinovelNewOption carries the settings passed to New when constructing a
// Bilinovel downloader.
type BilinovelNewOption struct {
// Headless is forwarded to the Chromium launch options by initBrowser;
// false runs the browser with a visible UI.
Headless bool
// Concurrency is stored on the downloader and used as the buffer size of the
// channel that getAllVolumes uses as a pool of concurrency slots, i.e. how
// many volumes are fetched at the same time.
// NOTE(review): no validation is visible here; 0 would make that channel
// unbuffered, so the slot acquisition in getAllVolumes (which happens before
// the goroutine that releases it is started) looks like it would block
// forever, and a negative value would panic in make — confirm and guard.
Concurrency int
}
func New(option BilinovelNewOption) (*Bilinovel, error) {
fontMapper, err := mapper.NewGlyphOutlineMapper(readTTF, miLantingTTF)
if err != nil {
return nil, fmt.Errorf("failed to create font mapper: %v", err)
@@ -46,13 +55,16 @@ func New() (*Bilinovel, error) {
restyClient := utils.NewRestyClient(50)
b := &Bilinovel{
fontMapper: fontMapper,
textOnly: false,
restyClient: restyClient,
fontMapper: fontMapper,
textOnly: false,
restyClient: restyClient,
pages: make(map[string]playwright.Page),
concurrency: option.Concurrency,
concurrentChan: make(chan any, option.Concurrency),
}
// 初始化浏览器实例
err = b.initBrowser()
err = b.initBrowser(option.Headless)
if err != nil {
return nil, fmt.Errorf("failed to init browser: %v", err)
}
@@ -69,13 +81,13 @@ func (b *Bilinovel) GetExtraFiles() []model.ExtraFile {
}
// initBrowser 初始化浏览器实例
func (b *Bilinovel) initBrowser() error {
func (b *Bilinovel) initBrowser(headless bool) error {
pw, err := playwright.Run()
if err != nil {
return fmt.Errorf("could not start playwright: %w", err)
}
b.browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
Headless: playwright.Bool(false),
Headless: playwright.Bool(headless),
})
if err != nil {
return fmt.Errorf("could not launch browser: %w", err)
@@ -86,11 +98,6 @@ func (b *Bilinovel) initBrowser() error {
return fmt.Errorf("could not create browser context: %w", err)
}
b.page, err = b.browserContext.NewPage()
if err != nil {
return fmt.Errorf("could not create page: %w", err)
}
log.Println("Browser initialized successfully")
return nil
}
@@ -103,7 +110,6 @@ func (b *Bilinovel) Close() error {
}
b.browser = nil
b.browserContext = nil
b.page = nil
}
return nil
}
@@ -115,7 +121,7 @@ func (b *Bilinovel) GetStyleCSS() string {
return string(styleCSS)
}
func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error) {
func (b *Bilinovel) GetNovel(novelId int, skipChapterContent bool, skipVolumes []int) (*model.Novel, error) {
log.Printf("Getting novel %v\n", novelId)
novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v.html", novelId)
@@ -145,7 +151,7 @@ func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error
novel.Authors = append(novel.Authors, strings.TrimSpace(s.Text()))
})
volumes, err := b.getAllVolumes(novelId, skipChapter)
volumes, err := b.getAllVolumes(novelId, skipChapterContent, skipVolumes)
if err != nil {
return nil, fmt.Errorf("failed to get novel volumes: %v", err)
}
@@ -154,7 +160,7 @@ func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error
return novel, nil
}
func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*model.Volume, error) {
func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapterContent bool) (*model.Volume, error) {
log.Printf("Getting volume %v of novel %v\n", volumeId, novelId)
novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
@@ -229,7 +235,7 @@ func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*mod
idRegexp := regexp.MustCompile(`/novel/(\d+)/(\d+).html`)
if !skipChapter {
if !skipChapterContent {
for i := range volume.Chapters {
matches := idRegexp.FindStringSubmatch(volume.Chapters[i].Url)
if len(matches) > 0 {
@@ -252,7 +258,7 @@ func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*mod
return volume, nil
}
func (b *Bilinovel) getAllVolumes(novelId int, skipChapter bool) ([]*model.Volume, error) {
func (b *Bilinovel) getAllVolumes(novelId int, skipChapterContent bool, skipVolumes []int) ([]*model.Volume, error) {
log.Printf("Getting all volumes of novel %v\n", novelId)
catelogUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
@@ -280,27 +286,63 @@ func (b *Bilinovel) getAllVolumes(novelId int, skipChapter bool) ([]*model.Volum
}
})
volumes := make([]*model.Volume, 0)
volumes := make([]*model.Volume, len(volumeIds))
var wg sync.WaitGroup
var mu sync.Mutex // 保护 volumes 写入的互斥锁
for i, volumeIdStr := range volumeIds {
volumeId, err := strconv.Atoi(volumeIdStr)
if err != nil {
return nil, fmt.Errorf("failed to convert volume id: %v", err)
}
volume, err := b.GetVolume(novelId, volumeId, skipChapter)
if err != nil {
return nil, fmt.Errorf("failed to get volume info: %v", err)
}
volume.SeriesIdx = i
volumes = append(volumes, volume)
wg.Add(1)
b.concurrentChan <- struct{}{} // 获取一个并发槽
go func(i int, volumeIdStr string) {
defer wg.Done()
defer func() { <-b.concurrentChan }() // 释放并发槽
volumeId, err := strconv.Atoi(volumeIdStr)
if err != nil {
log.Printf("failed to convert volume id %s: %v", volumeIdStr, err)
return
}
if slices.Contains(skipVolumes, volumeId) {
return
}
volume, err := b.GetVolume(novelId, volumeId, skipChapterContent)
if err != nil {
log.Printf("failed to get volume info for novel %d, volume %d: %v", novelId, volumeId, err)
return
}
volume.SeriesIdx = i
// 关闭浏览器标签页
pwPageKey := fmt.Sprintf("%v-%v", novelId, volumeId)
if pwPage, ok := b.pages[pwPageKey]; ok {
_ = pwPage.Close()
delete(b.pages, pwPageKey)
}
mu.Lock()
volumes[i] = volume
mu.Unlock()
}(i, volumeIdStr)
}
return volumes, nil
wg.Wait()
// 过滤掉获取失败的 nil volume
filteredVolumes := make([]*model.Volume, 0, len(volumes))
for _, vol := range volumes {
if vol != nil {
filteredVolumes = append(filteredVolumes, vol)
}
}
return filteredVolumes, nil
}
func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model.Chapter, error) {
log.Printf("Getting chapter %v of novel %v\n", chapterId, novelId)
page := 1
pageNum := 1
chapter := &model.Chapter{
Id: chapterId,
NovelId: novelId,
@@ -308,22 +350,30 @@ func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model
Url: fmt.Sprintf("https://www.bilinovel.com/novel/%v/%v.html", novelId, chapterId),
}
for {
hasNext, err := b.getChapterByPage(chapter, page)
pwPageKey := fmt.Sprintf("%v-%v", novelId, volumeId)
if _, ok := b.pages[pwPageKey]; !ok {
pwPage, err := b.browserContext.NewPage()
if err != nil {
return nil, fmt.Errorf("failed to create browser page: %w", err)
}
b.pages[pwPageKey] = pwPage
}
hasNext, err := b.getChapterByPage(b.pages[pwPageKey], chapter, pageNum)
if err != nil {
return nil, fmt.Errorf("failed to download chapter: %w", err)
}
if !hasNext {
break
}
page++
pageNum++
}
return chapter, nil
}
func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, error) {
log.Printf("Getting chapter %v by page %v\n", chapter.Id, page)
func (b *Bilinovel) getChapterByPage(pwPage playwright.Page, chapter *model.Chapter, pageNum int) (bool, error) {
log.Printf("Getting chapter %v by page %v\n", chapter.Id, pageNum)
Url := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", page)
Url := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", pageNum)
hasNext := false
headers := map[string]string{
@@ -344,8 +394,9 @@ func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, er
}
html := resp.Body()
// 解决乱序问题
resortedHtml, err := b.processContentWithPlaywright(string(html))
resortedHtml, err := b.processContentWithPlaywright(pwPage, string(html))
if err != nil {
return false, fmt.Errorf("failed to process html: %w", err)
}
@@ -354,7 +405,7 @@ func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, er
return false, fmt.Errorf("failed to parse html: %w", err)
}
if page == 1 {
if pageNum == 1 {
chapter.Title = doc.Find("#atitle").Text()
}
content := doc.Find("#acontent").First()
@@ -445,7 +496,7 @@ func (b *Bilinovel) getImg(url string) ([]byte, error) {
}
// processContentWithPlaywright 使用复用的浏览器实例处理内容
func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, error) {
func (b *Bilinovel) processContentWithPlaywright(page playwright.Page, htmlContent string) (string, error) {
tempFile, err := os.CreateTemp("", "bilinovel-temp-*.html")
if err != nil {
return "", fmt.Errorf("failed to create temp file: %w", err)
@@ -459,10 +510,10 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er
tempFile.Close()
tempFilePath := tempFile.Name()
_, err = b.page.ExpectResponse(func(url string) bool {
_, err = page.ExpectResponse(func(url string) bool {
return strings.Contains(url, "chapterlog.js")
}, func() error {
_, err = b.page.Goto("file://" + filepath.ToSlash(tempFilePath))
_, err = page.Goto("file://" + filepath.ToSlash(tempFilePath))
if err != nil {
return fmt.Errorf("could not navigate to file: %w", err)
}
@@ -474,7 +525,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er
return "", fmt.Errorf("failed to wait for network request finish")
}
err = b.page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{
err = page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{
State: playwright.WaitForSelectorStateVisible,
})
if err != nil {
@@ -482,7 +533,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er
}
// 遍历所有 #acontent 的子元素, 通过 window.getComputedStyle().display 检测是否是 none, 如果是 none 则从页面删除这个元素
result, err := b.page.Evaluate(`
result, err := page.Evaluate(`
(function() {
const acontent = document.getElementById('acontent');
if (!acontent) {
@@ -497,7 +548,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er
const element = elements[i];
const computedStyle = window.getComputedStyle(element);
if (computedStyle.display === 'none') {
if (computedStyle.display === 'none' || computedStyle.transform == 'matrix(0, 0, 0, 0, 0, 0)') {
element.remove();
removedCount++;
}
@@ -513,7 +564,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er
log.Printf("Hidden elements removal result: %s", result)
processedHTML, err := b.page.Content()
processedHTML, err := page.Content()
if err != nil {
return "", fmt.Errorf("could not get page content: %w", err)
}

12
downloader/downloader.go Normal file
View File

@@ -0,0 +1,12 @@
package downloader
import "bilinovel-downloader/model"
// Downloader is the set of operations the download command needs from a novel
// source: fetching novels, volumes and chapters, and supplying the stylesheet
// and extra files used when packing a volume.
type Downloader interface {
	// GetNovel fetches the novel identified by novelId together with its
	// volumes. skipChapterContent and skipVolumes are handed down to the
	// volume fetching; in the bilinovel implementation volumes whose id is
	// listed in skipVolumes are left out of the result.
	GetNovel(novelId int, skipChapterContent bool, skipVolumes []int) (*model.Novel, error)

	// GetVolume fetches a single volume of a novel. When skipChapterContent
	// is true the per-chapter download step is not run.
	GetVolume(novelId, volumeId int, skipChapterContent bool) (*model.Volume, error)

	// GetChapter fetches one chapter, identified by chapterId, of the given
	// volume of the given novel.
	GetChapter(novelId, volumeId, chapterId int) (*model.Chapter, error)

	// GetStyleCSS returns the stylesheet text that is passed to the EPUB packer.
	GetStyleCSS() string

	// GetExtraFiles returns the additional files that are passed to the EPUB
	// packer along with the stylesheet.
	GetExtraFiles() []model.ExtraFile

	// Close shuts the downloader down; the bilinovel implementation drops its
	// browser instance here.
	Close() error
}

View File

@@ -339,7 +339,7 @@ func addDirContentToZip(zipWriter *zip.Writer, dirPath string, method uint16) er
return err
}
relPath = filepath.ToSlash(relPath)
relPath = filepath.ToSlash(relPath)
file, err := os.Open(filePath)
if err != nil {

View File

@@ -1,16 +0,0 @@
package model
type ExtraFile struct {
Data []byte
Path string
ManifestItem ManifestItem
}
type Downloader interface {
GetNovel(novelId int, skipChapter bool) (*Novel, error)
GetVolume(novelId int, volumeId int, skipChapter bool) (*Volume, error)
GetChapter(novelId int, volumeId int, chapterId int) (*Chapter, error)
GetStyleCSS() string
GetExtraFiles() []ExtraFile
Close() error
}

View File

@@ -1,6 +1,14 @@
package model
import "encoding/xml"
import (
"encoding/xml"
)
// ExtraFile describes an additional file provided by a downloader (see
// GetExtraFiles) and handed to the EPUB packer together with the volume.
type ExtraFile struct {
// Data holds the file's bytes.
Data []byte
// Path names the file; presumably its location inside the packaged
// output — TODO confirm against the EPUB packer.
Path string
// ManifestItem is presumably the manifest entry registered for this file
// in the EPUB package document — TODO confirm.
ManifestItem ManifestItem
}
type DublinCoreMetadata struct {
XMLName xml.Name `xml:"metadata"`

View File

@@ -8,12 +8,12 @@ import (
)
func TestBilinovel_GetNovel(t *testing.T) {
bilinovel, err := bilinovel.New()
bilinovel, err := bilinovel.New(bilinovel.BilinovelNewOption{Headless: false, Concurrency: 5})
bilinovel.SetTextOnly(true)
if err != nil {
t.Fatalf("failed to create bilinovel: %v", err)
}
novel, err := bilinovel.GetNovel(4519, false)
novel, err := bilinovel.GetNovel(2727, false, nil)
if err != nil {
t.Fatalf("failed to get novel: %v", err)
}
@@ -25,12 +25,12 @@ func TestBilinovel_GetNovel(t *testing.T) {
}
func TestBilinovel_GetVolume(t *testing.T) {
bilinovel, err := bilinovel.New()
bilinovel, err := bilinovel.New(bilinovel.BilinovelNewOption{Headless: false, Concurrency: 1})
bilinovel.SetTextOnly(true)
if err != nil {
t.Fatalf("failed to create bilinovel: %v", err)
}
volume, err := bilinovel.GetVolume(1410, 52748, false)
volume, err := bilinovel.GetVolume(2727, 129092, false)
if err != nil {
t.Fatalf("failed to get volume: %v", err)
}
@@ -42,12 +42,12 @@ func TestBilinovel_GetVolume(t *testing.T) {
}
func TestBilinovel_GetChapter(t *testing.T) {
bilinovel, err := bilinovel.New()
bilinovel, err := bilinovel.New(bilinovel.BilinovelNewOption{Headless: false, Concurrency: 1})
bilinovel.SetTextOnly(true)
if err != nil {
t.Fatalf("failed to create bilinovel: %v", err)
}
chapter, err := bilinovel.GetChapter(3095, 154930, 154933)
chapter, err := bilinovel.GetChapter(2727, 129092, 129094)
if err != nil {
t.Fatalf("failed to get chapter: %v", err)
}