diff --git a/.vscode/launch.json b/.vscode/launch.json index 76707b0..3dd9a89 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -2,7 +2,7 @@ "version": "0.2.0", "configurations": [ { - "name": "download", + "name": "novel", "type": "go", "request": "launch", "mode": "auto", @@ -10,9 +10,22 @@ "args": [ "download", "-n", - "2321", - "-v", - "162759" + "2727", + "--concurrency", + "5" + ] + }, + { + "name": "volume", + "type": "go", + "request": "launch", + "mode": "auto", + "program": "${workspaceFolder}", + "args": [ + "download", + "-n=2727", + "-v=150098", + "--headless=false" ] } ] diff --git a/cmd/download.go b/cmd/download.go index 75106c8..3d39c79 100644 --- a/cmd/download.go +++ b/cmd/download.go @@ -1,6 +1,7 @@ package cmd import ( + "bilinovel-downloader/downloader" "bilinovel-downloader/downloader/bilinovel" "bilinovel-downloader/epub" "bilinovel-downloader/model" @@ -27,10 +28,12 @@ var downloadCmd = &cobra.Command{ } type downloadCmdArgs struct { - NovelId int `validate:"required"` - VolumeId int `validate:"required"` - outputPath string - outputType string + NovelId int `validate:"required"` + VolumeId int `validate:"required"` + outputPath string + outputType string + headless bool + concurrency int } var ( @@ -42,11 +45,16 @@ func init() { downloadCmd.Flags().IntVarP(&downloadArgs.VolumeId, "volume-id", "v", 0, "volume id") downloadCmd.Flags().StringVarP(&downloadArgs.outputPath, "output-path", "o", "novels", "output path") downloadCmd.Flags().StringVarP(&downloadArgs.outputType, "output-type", "t", "epub", "output type, epub or text") + downloadCmd.Flags().BoolVar(&downloadArgs.headless, "headless", true, "headless mode") + downloadCmd.Flags().IntVar(&downloadArgs.concurrency, "concurrency", 3, "concurrency of downloading volumes") RootCmd.AddCommand(downloadCmd) } func runDownloadNovel() error { - downloader, err := bilinovel.New() + downloader, err := bilinovel.New(bilinovel.BilinovelNewOption{ + Headless: downloadArgs.headless, + Concurrency: downloadArgs.concurrency, + }) if err != nil { return fmt.Errorf("failed to create downloader: %v", err) } @@ -63,16 +71,10 @@ func runDownloadNovel() error { if downloadArgs.VolumeId == 0 { // 下载整本小说 - novel, err := downloader.GetNovel(downloadArgs.NovelId, true) + err := downloadNovel(downloader, downloadArgs.NovelId) if err != nil { return fmt.Errorf("failed to get novel: %v", err) } - for _, volume := range novel.Volumes { - err = downloadVolume(downloader, volume.Id) - if err != nil { - return fmt.Errorf("failed to download volume: %v", err) - } - } } else { // 下载单卷 err = downloadVolume(downloader, downloadArgs.VolumeId) @@ -84,7 +86,59 @@ func runDownloadNovel() error { return nil } -func downloadVolume(downloader model.Downloader, volumeId int) error { +func downloadNovel(downloader downloader.Downloader, novelId int) error { + novelInfo, err := downloader.GetNovel(novelId, true, nil) + if err != nil { + return fmt.Errorf("failed to get novel info: %w", err) + } + skipVolumes := make([]int, 0) + for _, volume := range novelInfo.Volumes { + jsonPath := filepath.Join(downloadArgs.outputPath, fmt.Sprintf("volume-%d-%d.json", downloadArgs.NovelId, volume.Id)) + err = os.MkdirAll(filepath.Dir(jsonPath), 0755) + if err != nil { + return fmt.Errorf("failed to create directory: %v", err) + } + _, err = os.Stat(jsonPath) + if err == nil { + // 已经下载 + skipVolumes = append(skipVolumes, volume.Id) + } + } + novel, err := downloader.GetNovel(novelId, false, skipVolumes) + if err != nil { + return fmt.Errorf("failed to download novel: %w", err) + } + for _, volume := range novel.Volumes { + jsonPath := filepath.Join(downloadArgs.outputPath, fmt.Sprintf("volume-%d-%d.json", downloadArgs.NovelId, volume.Id)) + err = os.MkdirAll(filepath.Dir(jsonPath), 0755) + if err != nil { + return fmt.Errorf("failed to create directory: %v", err) + } + jsonFile, err := os.Create(jsonPath) + if err != nil { + return fmt.Errorf("failed to create json file: %v", err) + } + err = json.NewEncoder(jsonFile).Encode(volume) + if err != nil { + return fmt.Errorf("failed to encode json file: %v", err) + } + switch downloadArgs.outputType { + case "epub": + err = epub.PackVolumeToEpub(volume, downloadArgs.outputPath, downloader.GetStyleCSS(), downloader.GetExtraFiles()) + if err != nil { + return fmt.Errorf("failed to pack volume: %v", err) + } + case "text": + err = text.PackVolumeToText(volume, downloadArgs.outputPath) + if err != nil { + return fmt.Errorf("failed to pack volume: %v", err) + } + } + } + return nil +} + +func downloadVolume(downloader downloader.Downloader, volumeId int) error { jsonPath := filepath.Join(downloadArgs.outputPath, fmt.Sprintf("volume-%d-%d.json", downloadArgs.NovelId, volumeId)) err := os.MkdirAll(filepath.Dir(jsonPath), 0755) if err != nil { diff --git a/downloader/bilinovel/bilinovel.go b/downloader/bilinovel/bilinovel.go index fc8b95e..56f5b93 100644 --- a/downloader/bilinovel/bilinovel.go +++ b/downloader/bilinovel/bilinovel.go @@ -13,8 +13,10 @@ import ( "path" "path/filepath" "regexp" + "slices" "strconv" "strings" + "sync" "github.com/PuerkitoBio/goquery" mapper "github.com/bestnite/font-mapper" @@ -35,10 +37,17 @@ type Bilinovel struct { // 浏览器实例复用 browser playwright.Browser browserContext playwright.BrowserContext - page playwright.Page + pages map[string]playwright.Page + concurrency int + concurrentChan chan any } -func New() (*Bilinovel, error) { +type BilinovelNewOption struct { + Headless bool + Concurrency int +} + +func New(option BilinovelNewOption) (*Bilinovel, error) { fontMapper, err := mapper.NewGlyphOutlineMapper(readTTF, miLantingTTF) if err != nil { return nil, fmt.Errorf("failed to create font mapper: %v", err) @@ -46,13 +55,16 @@ func New() (*Bilinovel, error) { restyClient := utils.NewRestyClient(50) b := &Bilinovel{ - fontMapper: fontMapper, - textOnly: false, - restyClient: restyClient, + fontMapper: fontMapper, + textOnly: false, + restyClient: restyClient, + pages: make(map[string]playwright.Page), + concurrency: option.Concurrency, + concurrentChan: make(chan any, option.Concurrency), } // 初始化浏览器实例 - err = b.initBrowser() + err = b.initBrowser(option.Headless) if err != nil { return nil, fmt.Errorf("failed to init browser: %v", err) } @@ -69,13 +81,13 @@ func (b *Bilinovel) GetExtraFiles() []model.ExtraFile { } // initBrowser 初始化浏览器实例 -func (b *Bilinovel) initBrowser() error { +func (b *Bilinovel) initBrowser(headless bool) error { pw, err := playwright.Run() if err != nil { return fmt.Errorf("could not start playwright: %w", err) } b.browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{ - Headless: playwright.Bool(false), + Headless: playwright.Bool(headless), }) if err != nil { return fmt.Errorf("could not launch browser: %w", err) @@ -86,11 +98,6 @@ func (b *Bilinovel) initBrowser() error { return fmt.Errorf("could not create browser context: %w", err) } - b.page, err = b.browserContext.NewPage() - if err != nil { - return fmt.Errorf("could not create page: %w", err) - } - log.Println("Browser initialized successfully") return nil } @@ -103,7 +110,6 @@ func (b *Bilinovel) Close() error { } b.browser = nil b.browserContext = nil - b.page = nil } return nil } @@ -115,7 +121,7 @@ func (b *Bilinovel) GetStyleCSS() string { return string(styleCSS) } -func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error) { +func (b *Bilinovel) GetNovel(novelId int, skipChapterContent bool, skipVolumes []int) (*model.Novel, error) { log.Printf("Getting novel %v\n", novelId) novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v.html", novelId) @@ -145,7 +151,7 @@ func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error novel.Authors = append(novel.Authors, strings.TrimSpace(s.Text())) }) - volumes, err := b.getAllVolumes(novelId, skipChapter) + volumes, err := b.getAllVolumes(novelId, skipChapterContent, skipVolumes) if err != nil { return nil, fmt.Errorf("failed to get novel volumes: %v", err) } @@ -154,7 +160,7 @@ func (b *Bilinovel) GetNovel(novelId int, skipChapter bool) (*model.Novel, error return novel, nil } -func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*model.Volume, error) { +func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapterContent bool) (*model.Volume, error) { log.Printf("Getting volume %v of novel %v\n", volumeId, novelId) novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId) @@ -229,7 +235,7 @@ func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*mod idRegexp := regexp.MustCompile(`/novel/(\d+)/(\d+).html`) - if !skipChapter { + if !skipChapterContent { for i := range volume.Chapters { matches := idRegexp.FindStringSubmatch(volume.Chapters[i].Url) if len(matches) > 0 { @@ -252,7 +258,7 @@ func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapter bool) (*mod return volume, nil } -func (b *Bilinovel) getAllVolumes(novelId int, skipChapter bool) ([]*model.Volume, error) { +func (b *Bilinovel) getAllVolumes(novelId int, skipChapterContent bool, skipVolumes []int) ([]*model.Volume, error) { log.Printf("Getting all volumes of novel %v\n", novelId) catelogUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId) @@ -280,27 +286,63 @@ func (b *Bilinovel) getAllVolumes(novelId int, skipChapter bool) ([]*model.Volum } }) - volumes := make([]*model.Volume, 0) + volumes := make([]*model.Volume, len(volumeIds)) + var wg sync.WaitGroup + var mu sync.Mutex // 保护 volumes 写入的互斥锁 + for i, volumeIdStr := range volumeIds { - volumeId, err := strconv.Atoi(volumeIdStr) - if err != nil { - return nil, fmt.Errorf("failed to convert volume id: %v", err) - } - volume, err := b.GetVolume(novelId, volumeId, skipChapter) - if err != nil { - return nil, fmt.Errorf("failed to get volume info: %v", err) - } - volume.SeriesIdx = i - volumes = append(volumes, volume) + wg.Add(1) + b.concurrentChan <- struct{}{} // 获取一个并发槽 + + go func(i int, volumeIdStr string) { + defer wg.Done() + defer func() { <-b.concurrentChan }() // 释放并发槽 + + volumeId, err := strconv.Atoi(volumeIdStr) + if err != nil { + log.Printf("failed to convert volume id %s: %v", volumeIdStr, err) + return + } + if slices.Contains(skipVolumes, volumeId) { + return + } + volume, err := b.GetVolume(novelId, volumeId, skipChapterContent) + if err != nil { + log.Printf("failed to get volume info for novel %d, volume %d: %v", novelId, volumeId, err) + return + } + volume.SeriesIdx = i + + // 关闭浏览器标签页 + pwPageKey := fmt.Sprintf("%v-%v", novelId, volumeId) + if pwPage, ok := b.pages[pwPageKey]; ok { + _ = pwPage.Close() + delete(b.pages, pwPageKey) + } + + mu.Lock() + volumes[i] = volume + mu.Unlock() + }(i, volumeIdStr) } - return volumes, nil + wg.Wait() + + // 过滤掉获取失败的 nil volume + filteredVolumes := make([]*model.Volume, 0, len(volumes)) + for _, vol := range volumes { + if vol != nil { + filteredVolumes = append(filteredVolumes, vol) + } + } + + return filteredVolumes, nil } func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model.Chapter, error) { log.Printf("Getting chapter %v of novel %v\n", chapterId, novelId) - page := 1 + pageNum := 1 chapter := &model.Chapter{ Id: chapterId, NovelId: novelId, @@ -308,22 +350,30 @@ func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model Url: fmt.Sprintf("https://www.bilinovel.com/novel/%v/%v.html", novelId, chapterId), } for { - hasNext, err := b.getChapterByPage(chapter, page) + pwPageKey := fmt.Sprintf("%v-%v", novelId, volumeId) + if _, ok := b.pages[pwPageKey]; !ok { + pwPage, err := b.browserContext.NewPage() + if err != nil { + return nil, fmt.Errorf("failed to create browser page: %w", err) + } + b.pages[pwPageKey] = pwPage + } + hasNext, err := b.getChapterByPage(b.pages[pwPageKey], chapter, pageNum) if err != nil { return nil, fmt.Errorf("failed to download chapter: %w", err) } if !hasNext { break } - page++ + pageNum++ } return chapter, nil } -func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, error) { - log.Printf("Getting chapter %v by page %v\n", chapter.Id, page) +func (b *Bilinovel) getChapterByPage(pwPage playwright.Page, chapter *model.Chapter, pageNum int) (bool, error) { + log.Printf("Getting chapter %v by page %v\n", chapter.Id, pageNum) - Url := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", page) + Url := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", pageNum) hasNext := false headers := map[string]string{ @@ -344,8 +394,9 @@ func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, er } html := resp.Body() + // 解决乱序问题 - resortedHtml, err := b.processContentWithPlaywright(string(html)) + resortedHtml, err := b.processContentWithPlaywright(pwPage, string(html)) if err != nil { return false, fmt.Errorf("failed to process html: %w", err) } @@ -354,7 +405,7 @@ func (b *Bilinovel) getChapterByPage(chapter *model.Chapter, page int) (bool, er return false, fmt.Errorf("failed to parse html: %w", err) } - if page == 1 { + if pageNum == 1 { chapter.Title = doc.Find("#atitle").Text() } content := doc.Find("#acontent").First() @@ -445,7 +496,7 @@ func (b *Bilinovel) getImg(url string) ([]byte, error) { } // processContentWithPlaywright 使用复用的浏览器实例处理内容 -func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, error) { +func (b *Bilinovel) processContentWithPlaywright(page playwright.Page, htmlContent string) (string, error) { tempFile, err := os.CreateTemp("", "bilinovel-temp-*.html") if err != nil { return "", fmt.Errorf("failed to create temp file: %w", err) @@ -459,10 +510,10 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er tempFile.Close() tempFilePath := tempFile.Name() - _, err = b.page.ExpectResponse(func(url string) bool { + _, err = page.ExpectResponse(func(url string) bool { return strings.Contains(url, "chapterlog.js") }, func() error { - _, err = b.page.Goto("file://" + filepath.ToSlash(tempFilePath)) + _, err = page.Goto("file://" + filepath.ToSlash(tempFilePath)) if err != nil { return fmt.Errorf("could not navigate to file: %w", err) } @@ -474,7 +525,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er return "", fmt.Errorf("failed to wait for network request finish") } - err = b.page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{ + err = page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{ State: playwright.WaitForSelectorStateVisible, }) if err != nil { @@ -482,7 +533,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er } // 遍历所有 #acontent 的子元素, 通过 window.getComputedStyle().display 检测是否是 none, 如果是 none 则从页面删除这个元素 - result, err := b.page.Evaluate(` + result, err := page.Evaluate(` (function() { const acontent = document.getElementById('acontent'); if (!acontent) { @@ -497,7 +548,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er const element = elements[i]; const computedStyle = window.getComputedStyle(element); - if (computedStyle.display === 'none') { + if (computedStyle.display === 'none' || computedStyle.transform == 'matrix(0, 0, 0, 0, 0, 0)') { element.remove(); removedCount++; } @@ -513,7 +564,7 @@ func (b *Bilinovel) processContentWithPlaywright(htmlContent string) (string, er log.Printf("Hidden elements removal result: %s", result) - processedHTML, err := b.page.Content() + processedHTML, err := page.Content() if err != nil { return "", fmt.Errorf("could not get page content: %w", err) } diff --git a/downloader/downloader.go b/downloader/downloader.go new file mode 100644 index 0000000..a6e6e8c --- /dev/null +++ b/downloader/downloader.go @@ -0,0 +1,12 @@ +package downloader + +import "bilinovel-downloader/model" + +type Downloader interface { + GetNovel(novelId int, skipChapterContent bool, skipVolumes []int) (*model.Novel, error) + GetVolume(novelId int, volumeId int, skipChapterContent bool) (*model.Volume, error) + GetChapter(novelId int, volumeId int, chapterId int) (*model.Chapter, error) + GetStyleCSS() string + GetExtraFiles() []model.ExtraFile + Close() error +} diff --git a/epub/wrapper.go b/epub/wrapper.go index dc5a0a7..581570b 100644 --- a/epub/wrapper.go +++ b/epub/wrapper.go @@ -339,7 +339,7 @@ func addDirContentToZip(zipWriter *zip.Writer, dirPath string, method uint16) er return err } - relPath = filepath.ToSlash(relPath) + relPath = filepath.ToSlash(relPath) file, err := os.Open(filePath) if err != nil { diff --git a/model/downloader.go b/model/downloader.go deleted file mode 100644 index a245f1b..0000000 --- a/model/downloader.go +++ /dev/null @@ -1,16 +0,0 @@ -package model - -type ExtraFile struct { - Data []byte - Path string - ManifestItem ManifestItem -} - -type Downloader interface { - GetNovel(novelId int, skipChapter bool) (*Novel, error) - GetVolume(novelId int, volumeId int, skipChapter bool) (*Volume, error) - GetChapter(novelId int, volumeId int, chapterId int) (*Chapter, error) - GetStyleCSS() string - GetExtraFiles() []ExtraFile - Close() error -} diff --git a/model/epub.go b/model/epub.go index 965461d..5bff0d2 100644 --- a/model/epub.go +++ b/model/epub.go @@ -1,6 +1,14 @@ package model -import "encoding/xml" +import ( + "encoding/xml" +) + +type ExtraFile struct { + Data []byte + Path string + ManifestItem ManifestItem +} type DublinCoreMetadata struct { XMLName xml.Name `xml:"metadata"` diff --git a/test/bilinovel_test.go b/test/bilinovel_test.go index 67151a7..e186b3d 100644 --- a/test/bilinovel_test.go +++ b/test/bilinovel_test.go @@ -8,12 +8,12 @@ import ( ) func TestBilinovel_GetNovel(t *testing.T) { - bilinovel, err := bilinovel.New() + bilinovel, err := bilinovel.New(bilinovel.BilinovelNewOption{Headless: false, Concurrency: 5}) bilinovel.SetTextOnly(true) if err != nil { t.Fatalf("failed to create bilinovel: %v", err) } - novel, err := bilinovel.GetNovel(4519, false) + novel, err := bilinovel.GetNovel(2727, false, nil) if err != nil { t.Fatalf("failed to get novel: %v", err) } @@ -25,12 +25,12 @@ func TestBilinovel_GetNovel(t *testing.T) { } func TestBilinovel_GetVolume(t *testing.T) { - bilinovel, err := bilinovel.New() + bilinovel, err := bilinovel.New(bilinovel.BilinovelNewOption{Headless: false, Concurrency: 1}) bilinovel.SetTextOnly(true) if err != nil { t.Fatalf("failed to create bilinovel: %v", err) } - volume, err := bilinovel.GetVolume(1410, 52748, false) + volume, err := bilinovel.GetVolume(2727, 129092, false) if err != nil { t.Fatalf("failed to get volume: %v", err) } @@ -42,12 +42,12 @@ func TestBilinovel_GetVolume(t *testing.T) { } func TestBilinovel_GetChapter(t *testing.T) { - bilinovel, err := bilinovel.New() + bilinovel, err := bilinovel.New(bilinovel.BilinovelNewOption{Headless: false, Concurrency: 1}) bilinovel.SetTextOnly(true) if err != nil { t.Fatalf("failed to create bilinovel: %v", err) } - chapter, err := bilinovel.GetChapter(3095, 154930, 154933) + chapter, err := bilinovel.GetChapter(2727, 129092, 129094) if err != nil { t.Fatalf("failed to get chapter: %v", err) }