bilinovel-downloader/downloader/bilinovel/bilinovel.go

package bilinovel

import (
	"bilinovel-downloader/model"
	"bilinovel-downloader/utils"
	"bytes"
	"crypto/sha256"
	_ "embed"
	"fmt"
	"log/slog"
	"net/http"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"slices"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
	mapper "github.com/bestnite/font-mapper"
	"github.com/playwright-community/playwright-go"
)

// readTTF holds the contents of read.ttf, embedded at build time. New passes
// it as the first font to mapper.NewGlyphOutlineMapper.
// NOTE(review): presumably this is the site's "read" web font that
// getChapterByPage checks for — confirm.
//
//go:embed read.ttf
var readTTF []byte

// miLantingTTF holds the contents of "MI LANTING.ttf", embedded at build
// time. New passes it as the second font to mapper.NewGlyphOutlineMapper.
//
//go:embed "MI LANTING.ttf"
var miLantingTTF []byte

// Bilinovel is a downloader for novels hosted on www.bilinovel.com.
//
// Fields:
//   - fontMapper: glyph mapper built from the embedded fonts; getChapterByPage
//     uses it to remap runes of the last paragraph when the page uses the
//     "read" font family.
//   - textOnly: when true, getChapterByPage strips <img> elements instead of
//     downloading them.
//   - restyClient: HTTP client used for page and image requests.
//   - browser, browserContext: Playwright browser and context created once by
//     initBrowser and reused afterwards.
//   - pages: open Playwright pages keyed by "<novelId>-<volumeId>".
//   - concurrency, concurrentChan: configured parallelism and the channel
//     (capacity Concurrency) that getAllVolumes uses as a semaphore.
//   - logger: structured logger writing to stdout.
type Bilinovel struct {
	fontMapper  *mapper.GlyphOutlineMapper
	textOnly    bool
	restyClient *utils.RestyClient

	// Reused browser instance.
	browser        playwright.Browser
	browserContext playwright.BrowserContext
	pages          map[string]playwright.Page
	concurrency    int
	concurrentChan chan any

	logger *slog.Logger
}

// BilinovelNewOption configures New.
//
// Concurrency is used as the capacity of the semaphore channel that bounds
// how many volumes getAllVolumes fetches at once. Debug switches the logger to
// debug level and launches the browser non-headless with devtools enabled.
type BilinovelNewOption struct {
	Concurrency int
	Debug       bool
}

// New builds a Bilinovel downloader: it creates the font mapper from the
// embedded fonts, an HTTP client, a stdout text logger (debug level when
// option.Debug is set) and finally starts the shared browser.
//
// option.Concurrency values below 1 are treated as 1.
//
// It returns an error when the font mapper cannot be created or the browser
// cannot be initialized.
func New(option BilinovelNewOption) (*Bilinovel, error) {
	fontMapper, err := mapper.NewGlyphOutlineMapper(readTTF, miLantingTTF)
	if err != nil {
		return nil, fmt.Errorf("failed to create font mapper: %w", err)
	}
	restyClient := utils.NewRestyClient(50)

	// The concurrency value becomes the capacity of the semaphore channel.
	// With capacity 0 the first slot acquisition in getAllVolumes would block
	// forever (nobody is receiving yet), and a negative capacity makes make()
	// panic, so clamp to at least one slot.
	concurrency := option.Concurrency
	if concurrency < 1 {
		concurrency = 1
	}

	logLevel := slog.LevelInfo
	if option.Debug {
		logLevel = slog.LevelDebug
	}
	handlerOptions := &slog.HandlerOptions{
		Level: logLevel,
	}

	b := &Bilinovel{
		fontMapper:     fontMapper,
		textOnly:       false,
		restyClient:    restyClient,
		pages:          make(map[string]playwright.Page),
		concurrency:    concurrency,
		concurrentChan: make(chan any, concurrency),
		logger:         slog.New(slog.NewTextHandler(os.Stdout, handlerOptions)),
	}

	// Start the shared browser instance; the logger must already be set
	// because initBrowser logs through it.
	if err := b.initBrowser(option.Debug); err != nil {
		return nil, fmt.Errorf("failed to init browser: %w", err)
	}

	return b, nil
}

// SetTextOnly sets the textOnly flag. When it is true, getChapterByPage
// removes <img> elements from chapter content instead of downloading them.
func (b *Bilinovel) SetTextOnly(textOnly bool) {
	b.textOnly = textOnly
}

// GetExtraFiles always returns nil: this downloader contributes no extra
// files.
func (b *Bilinovel) GetExtraFiles() []model.ExtraFile {
	return nil
}

// initBrowser starts the Playwright driver, launches Chromium and creates the
// browser context that is reused for every chapter.
//
// When debug is true the browser runs non-headless with devtools open.
// b.browser and b.browserContext are only assigned when every step succeeds;
// on failure whatever was already started is shut down again so no driver or
// browser process is left behind.
func (b *Bilinovel) initBrowser(debug bool) error {
	pw, err := playwright.Run()
	if err != nil {
		return fmt.Errorf("could not start playwright: %w", err)
	}

	browser, err := pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
		Headless: playwright.Bool(!debug),
		Devtools: playwright.Bool(debug),
	})
	if err != nil {
		// Best-effort cleanup: the launch error is the one worth reporting.
		_ = pw.Stop()
		return fmt.Errorf("could not launch browser: %w", err)
	}

	browserContext, err := browser.NewContext()
	if err != nil {
		// Best-effort cleanup: the context error is the one worth reporting.
		_ = browser.Close()
		_ = pw.Stop()
		return fmt.Errorf("could not create browser context: %w", err)
	}

	b.browser = browser
	b.browserContext = browserContext

	b.logger.Info("Browser initialized successfully")
	return nil
}

// Close releases the shared browser. A failure to close it is logged rather
// than returned, so Close always returns nil. Calling it when no browser is
// set is a no-op.
func (b *Bilinovel) Close() error {
	if b.browser == nil {
		return nil
	}

	closeErr := b.browser.Close()
	if closeErr != nil {
		b.logger.Error("could not close browser", slog.Any("error", closeErr))
	}

	// Drop both handles even when closing failed.
	b.browser, b.browserContext = nil, nil
	return nil
}

// styleCSS holds the contents of style.css, embedded at build time and
// returned by GetStyleCSS.
//
//go:embed style.css
var styleCSS []byte

// GetStyleCSS returns the embedded style.css as a string.
func (b *Bilinovel) GetStyleCSS() string {
	return string(styleCSS)
}

// GetNovel fetches the novel page for novelId and fills in title,
// description and authors, then loads its volumes through getAllVolumes.
//
// skipChapterContent and skipVolumes are forwarded to getAllVolumes. Names
// found under both ".authorname" and ".illname" are appended to Authors, in
// that order.
//
// It returns an error when the novel page cannot be fetched (including a
// non-200 status), cannot be parsed, or the volume list cannot be loaded.
// Underlying errors are wrapped so callers can inspect them with errors.Is /
// errors.As.
func (b *Bilinovel) GetNovel(novelId int, skipChapterContent bool, skipVolumes []int) (*model.Novel, error) {
	b.logger.Info("Getting novel", slog.Int("novelId", novelId))

	novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v.html", novelId)
	resp, err := b.restyClient.R().Get(novelUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	novel := &model.Novel{
		Id:          novelId,
		Title:       strings.TrimSpace(doc.Find(".book-title").First().Text()),
		Description: strings.TrimSpace(doc.Find(".book-summary>content").First().Text()),
	}

	// Writers first, then illustrators; both end up in Authors.
	collectAuthor := func(_ int, s *goquery.Selection) {
		novel.Authors = append(novel.Authors, strings.TrimSpace(s.Text()))
	}
	doc.Find(".authorname>a").Each(collectAuthor)
	doc.Find(".illname>a").Each(collectAuthor)

	volumes, err := b.getAllVolumes(novelId, skipChapterContent, skipVolumes)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel volumes: %w", err)
	}
	novel.Volumes = volumes

	return novel, nil
}

// chapterIdRegexp extracts the novel id and chapter id from a chapter URL of
// the form /novel/<novelId>/<chapterId>.html. It is compiled once instead of
// on every GetVolume call.
var chapterIdRegexp = regexp.MustCompile(`/novel/(\d+)/(\d+)\.html`)

// GetVolume loads one volume of a novel.
//
// It first fetches the novel's catalog page to find the volume's position
// (SeriesIdx is the 1-based index of the matching "a.volume-cover-img" link)
// and the novel title, then fetches the volume page for title, description,
// cover, authors/illustrators and the chapter list. Unless skipChapterContent
// is true, every listed chapter is replaced by the result of GetChapter.
//
// It returns an error when a page cannot be fetched (including a non-200
// status) or parsed, when the volume is not listed in the catalog, when the
// cover cannot be downloaded, or when any chapter fails. Underlying errors
// are wrapped so callers can inspect them with errors.Is / errors.As.
func (b *Bilinovel) GetVolume(novelId int, volumeId int, skipChapterContent bool) (*model.Volume, error) {
	b.logger.Info("Getting volume of novel", slog.Int("volumeId", volumeId), slog.Int("novelId", novelId))

	novelUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
	resp, err := b.restyClient.R().Get(novelUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	// 0 means "not found"; a match stores the 1-based position. The last
	// matching link wins if the href appears more than once.
	seriesIdx := 0
	volumeHref := fmt.Sprintf("/novel/%v/vol_%v.html", novelId, volumeId)
	doc.Find("a.volume-cover-img").Each(func(i int, s *goquery.Selection) {
		if s.AttrOr("href", "") == volumeHref {
			seriesIdx = i + 1
		}
	})

	novelTitle := strings.TrimSpace(doc.Find(".book-title").First().Text())

	if seriesIdx == 0 {
		return nil, fmt.Errorf("volume not found: %v", volumeId)
	}

	volumeUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/vol_%v.html", novelId, volumeId)
	resp, err = b.restyClient.R().Get(volumeUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get novel info: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get novel info: %v", resp.Status())
	}

	doc, err = goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	volume := &model.Volume{}
	volume.NovelId = novelId
	volume.NovelTitle = novelTitle
	volume.Id = volumeId
	volume.SeriesIdx = seriesIdx
	volume.Title = strings.TrimSpace(doc.Find(".book-title").First().Text())
	volume.Description = strings.TrimSpace(doc.Find(".book-summary>content").First().Text())
	volume.Url = volumeUrl
	volume.Chapters = make([]*model.Chapter, 0)
	volume.CoverUrl = doc.Find(".book-cover").First().AttrOr("src", "")
	cover, err := b.getImg(volume.CoverUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get cover: %w", err)
	}
	volume.Cover = cover

	// Writers first, then illustrators; both end up in Authors.
	collectAuthor := func(_ int, s *goquery.Selection) {
		volume.Authors = append(volume.Authors, strings.TrimSpace(s.Text()))
	}
	doc.Find(".authorname>a").Each(collectAuthor)
	doc.Find(".illname>a").Each(collectAuthor)

	doc.Find(".chapter-li.jsChapter").Each(func(_ int, s *goquery.Selection) {
		link := s.Find("a")
		volume.Chapters = append(volume.Chapters, &model.Chapter{
			Title: link.Text(),
			Url:   fmt.Sprintf("https://www.bilinovel.com%v", link.AttrOr("href", "")),
		})
	})

	if skipChapterContent {
		return volume, nil
	}

	for i, listed := range volume.Chapters {
		matches := chapterIdRegexp.FindStringSubmatch(listed.Url)
		if len(matches) == 0 {
			return nil, fmt.Errorf("failed to get chapter id: %v", listed.Url)
		}
		chapterId, err := strconv.Atoi(matches[2])
		if err != nil {
			return nil, fmt.Errorf("failed to convert chapter id: %w", err)
		}
		chapter, err := b.GetChapter(novelId, volumeId, chapterId)
		if err != nil {
			return nil, fmt.Errorf("failed to get chapter: %w", err)
		}
		chapter.Id = chapterId
		// The downloaded chapter replaces the placeholder from the list.
		volume.Chapters[i] = chapter
	}

	return volume, nil
}

// pagesMu serializes every access to Bilinovel.pages. getAllVolumes runs
// GetVolume (and through it GetChapter) on several goroutines at once, and a
// Go map must not be written while another goroutine reads or writes it.
var pagesMu sync.Mutex

// volumePage returns the browser page cached for the novel/volume pair,
// creating and caching a new one on first use.
func (b *Bilinovel) volumePage(novelId int, volumeId int) (playwright.Page, error) {
	key := fmt.Sprintf("%v-%v", novelId, volumeId)

	pagesMu.Lock()
	defer pagesMu.Unlock()

	if page, ok := b.pages[key]; ok {
		return page, nil
	}
	page, err := b.browserContext.NewPage()
	if err != nil {
		return nil, fmt.Errorf("failed to create browser page: %w", err)
	}
	b.pages[key] = page
	return page, nil
}

// closeVolumePage closes and forgets the browser page cached for the
// novel/volume pair, if there is one.
func (b *Bilinovel) closeVolumePage(novelId int, volumeId int) {
	key := fmt.Sprintf("%v-%v", novelId, volumeId)

	pagesMu.Lock()
	page, ok := b.pages[key]
	delete(b.pages, key)
	pagesMu.Unlock()

	if ok {
		// Best effort: a tab that fails to close is not worth failing for.
		_ = page.Close()
	}
}

// getAllVolumes reads the novel's catalog page, collects the ids of all
// listed volumes and loads them concurrently with GetVolume, bounded by
// b.concurrentChan.
//
// Volumes whose id is in skipVolumes are not fetched. A volume that fails to
// load is logged and left out of the result; it does not fail the whole call.
// The returned slice keeps catalog order.
//
// It returns an error only when the catalog page itself cannot be fetched
// (including a non-200 status) or parsed.
func (b *Bilinovel) getAllVolumes(novelId int, skipChapterContent bool, skipVolumes []int) ([]*model.Volume, error) {
	b.logger.Info("Getting all volumes of novel", slog.Int("novelId", novelId))

	catalogUrl := fmt.Sprintf("https://www.bilinovel.com/novel/%v/catalog", novelId)
	resp, err := b.restyClient.R().Get(catalogUrl)
	if err != nil {
		return nil, fmt.Errorf("failed to get catelog: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get catelog: %v", resp.Status())
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
	if err != nil {
		return nil, fmt.Errorf("failed to parse html: %w", err)
	}

	// Depends on novelId, so it cannot be compiled at package scope.
	volumeRegexp := regexp.MustCompile(fmt.Sprintf(`/novel/%v/vol_(\d+)\.html`, novelId))

	volumeIds := make([]string, 0)
	doc.Find("a.volume-cover-img").Each(func(_ int, s *goquery.Selection) {
		matches := volumeRegexp.FindStringSubmatch(s.AttrOr("href", ""))
		if len(matches) > 0 {
			volumeIds = append(volumeIds, matches[1])
		}
	})

	// One slot per catalog entry. Each goroutine writes only its own index
	// and wg.Wait orders those writes before the reads below, so the slice
	// itself needs no lock.
	volumes := make([]*model.Volume, len(volumeIds))
	var wg sync.WaitGroup

	for i, volumeIdStr := range volumeIds {
		wg.Add(1)
		b.concurrentChan <- struct{}{} // acquire a concurrency slot

		go func(i int, volumeIdStr string) {
			defer wg.Done()
			defer func() { <-b.concurrentChan }() // release the concurrency slot

			volumeId, err := strconv.Atoi(volumeIdStr)
			if err != nil {
				b.logger.Error("failed to convert volume id", slog.String("volumeIdStr", volumeIdStr), slog.Any("error", err))
				return
			}
			if slices.Contains(skipVolumes, volumeId) {
				return
			}

			// Close this volume's browser tab whether or not the download
			// succeeded, so a failed volume does not leave its tab open.
			defer b.closeVolumePage(novelId, volumeId)

			volume, err := b.GetVolume(novelId, volumeId, skipChapterContent)
			if err != nil {
				b.logger.Error("failed to get volume info", slog.Int("novelId", novelId), slog.Int("volumeId", volumeId), slog.Any("error", err))
				return
			}
			// NOTE(review): this overrides the 1-based SeriesIdx that
			// GetVolume computed with a 0-based catalog index — confirm
			// which one consumers expect.
			volume.SeriesIdx = i

			volumes[i] = volume
		}(i, volumeIdStr)
	}

	wg.Wait()

	// Drop the nil slots left by skipped or failed volumes.
	filteredVolumes := make([]*model.Volume, 0, len(volumes))
	for _, vol := range volumes {
		if vol != nil {
			filteredVolumes = append(filteredVolumes, vol)
		}
	}

	return filteredVolumes, nil
}

// GetChapter downloads every page of one chapter and returns it with Id,
// NovelId, VolumeId and Url set and the content of all pages accumulated.
//
// Pages are fetched starting at 1 until getChapterByPage reports that there
// is no next page. All chapters of a volume share one browser page, which is
// created on first use.
//
// It returns an error when the browser page cannot be created or any chapter
// page fails to download.
func (b *Bilinovel) GetChapter(novelId int, volumeId int, chapterId int) (*model.Chapter, error) {
	b.logger.Info("Getting chapter of novel", slog.Int("chapterId", chapterId), slog.Int("novelId", novelId))

	chapter := &model.Chapter{
		Id:       chapterId,
		NovelId:  novelId,
		VolumeId: volumeId,
		Url:      fmt.Sprintf("https://www.bilinovel.com/novel/%v/%v.html", novelId, chapterId),
	}

	pwPage, err := b.volumePage(novelId, volumeId)
	if err != nil {
		return nil, err
	}

	for pageNum := 1; ; pageNum++ {
		hasNext, err := b.getChapterByPage(pwPage, chapter, pageNum)
		if err != nil {
			return nil, fmt.Errorf("failed to download chapter: %w", err)
		}
		if !hasNext {
			return chapter, nil
		}
	}
}

// nextPageUrlRegexp captures the quoted value that follows "url_next:" in the
// page source (single or double quotes).
var nextPageUrlRegexp = regexp.MustCompile(`url_next:\s?['"]([^'"]*?)['"]`)

// cleanNextPageUrlRegexp matches a trailing ".html" together with an optional
// "_<digits>" page suffix, so that it can be stripped from a URL.
var cleanNextPageUrlRegexp = regexp.MustCompile(`(_\d+)?\.html$`)

// getChapterByPage downloads page pageNum of chapter, cleans it up and
// appends the result to chapter.Content. It reports whether the chapter has
// a further page.
//
// The raw HTML is fetched over HTTP, then loaded into pwPage through
// processContentWithPlaywright, which drops hidden decoy elements. From the
// processed document the function
//   - sets chapter.Title from "#atitle" (first page only),
//   - removes ad containers from "#acontent",
//   - remaps the runes of the last paragraph through b.fontMapper when the
//     page uses the "read" font family,
//   - removes images (text-only mode) or downloads them into
//     chapter.Content.Images and points <img src> at a hash-based file name,
//   - strips all attributes whose name starts with "data-k".
//
// An image that fails to download is logged and skipped. Any other failure
// is returned as an error.
func (b *Bilinovel) getChapterByPage(pwPage playwright.Page, chapter *model.Chapter, pageNum int) (bool, error) {
	b.logger.Info("Getting chapter by page", slog.Int("chapter", chapter.Id), slog.Int("page", pageNum))

	pageUrl := strings.TrimSuffix(chapter.Url, ".html") + fmt.Sprintf("_%v.html", pageNum)

	headers := map[string]string{
		"Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
		"Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,zh-TW;q=0.6",
		"Cookie":          "night=1;",
	}
	resp, err := b.restyClient.R().SetHeaders(headers).Get(pageUrl)
	if err != nil {
		return false, fmt.Errorf("failed to get chapter: %w", err)
	}
	if resp.StatusCode() != http.StatusOK {
		return false, fmt.Errorf("failed to get chapter: %v", resp.Status())
	}

	// First signal: the page carries an explicit "next page" link.
	hasNext := strings.Contains(resp.String(), `<a onclick="window.location.href = ReadParams.url_next;">下一頁</a>`)

	// Let the browser process the page to undo the scrambled paragraph order.
	resortedHtml, err := b.processContentWithPlaywright(pwPage, string(resp.Body()))
	if err != nil {
		return false, fmt.Errorf("failed to process html: %w", err)
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(resortedHtml))
	if err != nil {
		return false, fmt.Errorf("failed to parse html: %w", err)
	}

	// Second signal: url_next points at another page of this same chapter.
	n := nextPageUrlRegexp.FindStringSubmatch(resortedHtml)
	if len(n) != 2 {
		return false, fmt.Errorf("failed to determine wether there is a next page")
	}
	// Compare both URLs with their "_<page>.html" tail stripped. A suffix
	// match (rather than a substring match) keeps an empty url_next, the
	// novel's own page, or a chapter whose id is merely a prefix of this one
	// from being mistaken for "same chapter".
	nextBase := cleanNextPageUrlRegexp.ReplaceAllString(n[1], "")
	currentBase := cleanNextPageUrlRegexp.ReplaceAllString(pageUrl, "")
	if nextBase != "" && strings.HasSuffix(currentBase, nextBase) {
		hasNext = true
	}

	if pageNum == 1 {
		chapter.Title = doc.Find("#atitle").Text()
	}
	content := doc.Find("#acontent").First()
	content.Find(".cgo").Remove()
	content.Find("center").Remove()
	content.Find(".google-auto-placed").Remove()

	if strings.Contains(resortedHtml, `font-family: "read"`) {
		lastParagraph := content.Find("p").Last()
		obfuscated, err := lastParagraph.Html()
		if err != nil {
			return false, fmt.Errorf("failed to get html: %w", err)
		}
		var mapped strings.Builder
		for _, r := range obfuscated {
			// Runes the mapper does not report as ok are dropped.
			if _, newRune, ok := b.fontMapper.MappingRune(r); ok {
				mapped.WriteRune(newRune)
			}
		}
		lastParagraph.SetHtml(mapped.String())
	}

	if b.textOnly {
		content.Find("img").Remove()
	} else {
		content.Find("img").Each(func(_ int, s *goquery.Selection) {
			imgUrl := s.AttrOr("data-src", "")
			if imgUrl == "" {
				imgUrl = s.AttrOr("src", "")
			}
			if imgUrl == "" {
				return
			}

			// The local file name is the hex SHA-256 of the URL plus the
			// URL's extension.
			imageHash := sha256.Sum256([]byte(imgUrl))
			imageFilename := fmt.Sprintf("%x%s", imageHash[:], path.Ext(imgUrl))
			s.SetAttr("src", imageFilename)
			s.SetAttr("alt", imgUrl)
			s.RemoveAttr("class")

			img, err := b.getImg(imgUrl)
			if err != nil {
				// Keep going without this image, but leave a trace.
				b.logger.Warn("failed to get chapter image", slog.String("url", imgUrl), slog.Any("error", err))
				return
			}
			if chapter.Content == nil {
				chapter.Content = &model.ChaperContent{}
			}
			if chapter.Content.Images == nil {
				chapter.Content.Images = make(map[string][]byte)
			}
			chapter.Content.Images[imageFilename] = img
		})
	}

	// Strip every attribute whose name starts with "data-k".
	doc.Find("*").Each(func(_ int, s *goquery.Selection) {
		if len(s.Nodes) == 0 {
			return
		}
		// Collect the names first: removing attributes while ranging over
		// the node's attribute slice can skip entries.
		var staleKeys []string
		for _, attr := range s.Nodes[0].Attr {
			if strings.HasPrefix(attr.Key, "data-k") {
				staleKeys = append(staleKeys, attr.Key)
			}
		}
		for _, key := range staleKeys {
			s.RemoveAttr(key)
		}
	})

	htmlStr, err := content.Html()
	if err != nil {
		return false, fmt.Errorf("failed to get html: %w", err)
	}

	if chapter.Content == nil {
		chapter.Content = &model.ChaperContent{}
	}
	chapter.Content.Html += strings.TrimSpace(htmlStr)

	return hasNext, nil
}

// getImg downloads url with a bilinovel Referer header and returns the
// response body.
//
// It returns an error when the request fails or the server answers with a
// status other than 200, so that an error page is never stored as image
// data.
func (b *Bilinovel) getImg(url string) ([]byte, error) {
	b.logger.Info("Getting img", slog.String("url", url))
	resp, err := b.restyClient.R().SetHeader("Referer", "https://www.bilinovel.com").Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("failed to get img: %v", resp.Status())
	}

	return resp.Body(), nil
}

// processContentWithPlaywright renders htmlContent in the given (reused)
// browser page and returns the page's HTML after hidden elements have been
// removed from "#acontent".
//
// The HTML is written to a temporary file under
// <os.TempDir()>/bilinovel-downloader, which is deleted again when the
// function returns. The page is navigated to that file, the function waits
// (up to 10s each) for a response whose URL contains "chapterlog.js" and for
// "#acontent" to become visible, and then runs a script that deletes every
// descendant of "#acontent" whose computed style is display:none or whose
// transform is the all-zero matrix.
//
// It returns an error when the temporary file cannot be prepared, when the
// navigation or either wait fails, or when the script or the content
// retrieval fails.
func (b *Bilinovel) processContentWithPlaywright(page playwright.Page, htmlContent string) (string, error) {
	// Replace window.location.replace so the page cannot navigate away.
	htmlContent = strings.ReplaceAll(htmlContent, "window.location.replace", "console.log")

	tempPath := filepath.Join(os.TempDir(), "bilinovel-downloader")
	err := os.MkdirAll(tempPath, 0755)
	if err != nil {
		return "", fmt.Errorf("failed to create temp dir: %w", err)
	}
	tempFile, err := os.CreateTemp(tempPath, "temp-*.html")
	if err != nil {
		return "", fmt.Errorf("failed to create temp file: %w", err)
	}
	tempFilePath := tempFile.Name()
	defer os.Remove(tempFilePath)

	// Always close the handle, also when the write failed, and treat a
	// failed close as a failed write: the browser must see complete content.
	_, err = tempFile.WriteString(htmlContent)
	if closeErr := tempFile.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		return "", fmt.Errorf("failed to write temp file: %w", err)
	}

	// // Block requests
	// googleAdsDomains := []string{
	// 	"adtrafficquality.google",
	// 	"doubleclick.net",
	// 	"googlesyndication.com",
	// 	"googletagmanager.com",
	// 	"hm.baidu.com",
	// 	"cloudflareinsights.com",
	// 	"fsdoa.js",                         // adblock detection
	// 	"https://www.linovelib.com/novel/", // prevent jumping from the local file to the online page
	// }
	// err = page.Route("**/*", func(route playwright.Route) {
	// 	for _, d := range googleAdsDomains {
	// 		if strings.Contains(route.Request().URL(), d) {
	// 			b.logger.Debug("blocking request", slog.String("url", route.Request().URL()))
	// 			err := route.Abort("aborted")
	// 			if err != nil {
	// 				b.logger.Debug("failed to block request", route.Request().URL(), err)
	// 			}
	// 			return
	// 		}
	// 	}
	// 	_ = route.Continue()
	// })
	// if err != nil {
	// 	return "", fmt.Errorf("failed to intercept requests: %w", err)
	// }

	// Navigate and wait until the page has requested chapterlog.js.
	_, err = page.ExpectResponse(func(url string) bool {
		return strings.Contains(url, "chapterlog.js")
	}, func() error {
		// Use a local error so the outer err is not overwritten from inside
		// the callback.
		if _, gotoErr := page.Goto("file://" + filepath.ToSlash(tempFilePath)); gotoErr != nil {
			return fmt.Errorf("could not navigate to file: %w", gotoErr)
		}
		return nil
	}, playwright.PageExpectResponseOptions{
		Timeout: playwright.Float(10000),
	})
	if err != nil {
		return "", fmt.Errorf("failed to wait for network request finish: %w", err)
	}

	err = page.Locator("#acontent").WaitFor(playwright.LocatorWaitForOptions{
		State:   playwright.WaitForSelectorStateVisible,
		Timeout: playwright.Float(10000),
	})
	if err != nil {
		return "", fmt.Errorf("could not wait for #acontent: %w", err)
	}

	// Walk all descendants of #acontent and delete those that the computed
	// style hides (display:none or an all-zero transform matrix).
	result, err := page.Evaluate(`
		(function() {
			const acontent = document.getElementById('acontent');
			if (!acontent) {
				return 'acontent element not found';
			}

			let removedCount = 0;
			const elements = acontent.querySelectorAll('*');

			// 从后往前遍历，避免删除元素时影响索引
			for (let i = elements.length - 1; i >= 0; i--) {
				const element = elements[i];
				const computedStyle = window.getComputedStyle(element);

				if (computedStyle.display === 'none' || computedStyle.transform == 'matrix(0, 0, 0, 0, 0, 0)') {
					element.remove();
					removedCount++;
				}
			}

			return 'Removed ' + removedCount + ' hidden elements';
		})()
	`)

	if err != nil {
		return "", fmt.Errorf("failed to remove hidden elements: %w", err)
	}

	b.logger.Debug("Hidden elements removal result", slog.Any("count", result))

	processedHTML, err := page.Content()
	if err != nil {
		return "", fmt.Errorf("could not get page content: %w", err)
	}

	return processedHTML, nil
}