166 lines
4.0 KiB
Go
166 lines
4.0 KiB
Go
package crawler
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"pcgamedb/constant"
|
|
"pcgamedb/db"
|
|
"pcgamedb/model"
|
|
"pcgamedb/utils"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
type FitGirlCrawler struct {
|
|
logger *zap.Logger
|
|
}
|
|
|
|
func NewFitGirlCrawler(logger *zap.Logger) *FitGirlCrawler {
|
|
return &FitGirlCrawler{
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
func (c *FitGirlCrawler) Name() string {
|
|
return "FitGirlCrawler"
|
|
}
|
|
|
|
func (c *FitGirlCrawler) CrawlByUrl(URL string) (*model.GameItem, error) {
|
|
resp, err := utils.Request().Get(URL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
titleElem := doc.Find("h3").First().Find("strong")
|
|
if titleElem.Length() == 0 {
|
|
return nil, errors.New("failed to find title")
|
|
}
|
|
rawTitle := titleElem.Text()
|
|
titleElem.Children().Remove()
|
|
title := strings.TrimSpace(titleElem.Text())
|
|
sizeRegex := regexp.MustCompile(`Repack Size: <strong>(.*?)</strong>`)
|
|
sizeRegexRes := sizeRegex.FindStringSubmatch(string(resp.Body()))
|
|
if len(sizeRegexRes) == 0 {
|
|
return nil, errors.New("failed to find size")
|
|
}
|
|
size := sizeRegexRes[1]
|
|
magnetRegex := regexp.MustCompile(`magnet:\?[^"]*`)
|
|
magnetRegexRes := magnetRegex.FindStringSubmatch(string(resp.Body()))
|
|
if len(magnetRegexRes) == 0 {
|
|
return nil, errors.New("failed to find magnet")
|
|
}
|
|
magnet := magnetRegexRes[0]
|
|
item, err := db.GetGameItemByUrl(URL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
item.Name = strings.TrimSpace(title)
|
|
item.RawName = rawTitle
|
|
item.Url = URL
|
|
item.Size = size
|
|
item.Author = "FitGirl"
|
|
item.Download = magnet
|
|
item.Platform = "windows"
|
|
return item, nil
|
|
}
|
|
|
|
func (c *FitGirlCrawler) Crawl(page int) ([]*model.GameItem, error) {
|
|
resp, err := utils.Request().Get(fmt.Sprintf(constant.FitGirlURL, page))
|
|
if err != nil {
|
|
c.logger.Error("Failed to fetch", zap.Error(err))
|
|
return nil, err
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
c.logger.Error("Failed to parse HTML", zap.Error(err))
|
|
return nil, err
|
|
}
|
|
var urls []string
|
|
var updateFlags []string //link+date
|
|
doc.Find("article").Each(func(i int, s *goquery.Selection) {
|
|
u, exist1 := s.Find(".entry-title>a").First().Attr("href")
|
|
d, exist2 := s.Find("time").First().Attr("datetime")
|
|
if exist1 && exist2 {
|
|
urls = append(urls, u)
|
|
updateFlags = append(updateFlags, fmt.Sprintf("%s%s", u, d))
|
|
}
|
|
})
|
|
var res []*model.GameItem
|
|
for i, u := range urls {
|
|
if db.IsFitgirlCrawled(updateFlags[i]) {
|
|
continue
|
|
}
|
|
c.logger.Info("Crawling", zap.String("URL", u))
|
|
item, err := c.CrawlByUrl(u)
|
|
if err != nil {
|
|
c.logger.Warn("Failed to crawl", zap.Error(err), zap.String("URL", u))
|
|
continue
|
|
}
|
|
item.UpdateFlag = updateFlags[i]
|
|
err = db.SaveGameItem(item)
|
|
if err != nil {
|
|
c.logger.Warn("Failed to save", zap.Error(err))
|
|
continue
|
|
}
|
|
res = append(res, item)
|
|
if err := OrganizeGameItem(item); err != nil {
|
|
c.logger.Warn("Failed to organize", zap.Error(err), zap.String("URL", u))
|
|
continue
|
|
}
|
|
}
|
|
return res, nil
|
|
}
|
|
|
|
func (c *FitGirlCrawler) CrawlMulti(pages []int) ([]*model.GameItem, error) {
|
|
var res []*model.GameItem
|
|
for _, page := range pages {
|
|
items, err := c.Crawl(page)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
res = append(res, items...)
|
|
}
|
|
return res, nil
|
|
}
|
|
|
|
func (c *FitGirlCrawler) CrawlAll() ([]*model.GameItem, error) {
|
|
var res []*model.GameItem
|
|
totalPageNum, err := c.GetTotalPageNum()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for i := 1; i <= totalPageNum; i++ {
|
|
items, err := c.Crawl(i)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
res = append(res, items...)
|
|
}
|
|
return res, nil
|
|
}
|
|
|
|
func (c *FitGirlCrawler) GetTotalPageNum() (int, error) {
|
|
resp, err := utils.Request().Get(fmt.Sprintf(constant.FitGirlURL, 1))
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
page, err := strconv.Atoi(doc.Find(".page-numbers.dots").First().Next().Text())
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return page, nil
|
|
}
|