256 lines
5.6 KiB
Go
256 lines
5.6 KiB
Go
package crawler
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/base64"
|
|
"errors"
|
|
"fmt"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"game-crawler/constant"
|
|
"game-crawler/db"
|
|
"game-crawler/model"
|
|
"game-crawler/utils"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
type XatabCrawler struct {
|
|
*BaseLogger
|
|
*BaseError
|
|
logger *zap.Logger
|
|
}
|
|
|
|
func NewXatabCrawler(logger *zap.Logger) *XatabCrawler {
|
|
return &XatabCrawler{
|
|
BaseLogger: &BaseLogger{
|
|
logger: logger,
|
|
},
|
|
BaseError: &BaseError{},
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
func (c *XatabCrawler) Name() string {
|
|
return "XatabCrawler"
|
|
}
|
|
|
|
func (c *XatabCrawler) Crawl(page int) ([]*model.GameItem, error) {
|
|
c.LogCrawlStart(page)
|
|
|
|
requestURL := fmt.Sprintf("%s/page/%v", constant.XatabBaseURL, page)
|
|
|
|
resp, err := utils.Request().SetLogger(c.logger.Sugar()).Get(requestURL)
|
|
if err != nil {
|
|
return nil, c.ErrRequest(requestURL, err)
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return nil, c.ErrParseDoc(requestURL, err)
|
|
}
|
|
|
|
var urls []string
|
|
var updateFlags []string
|
|
|
|
doc.Find(".entry").Each(func(i int, s *goquery.Selection) {
|
|
u, exists := s.Find(".entry__title.h2 a").Attr("href")
|
|
if !exists {
|
|
return
|
|
}
|
|
urls = append(urls, u)
|
|
updateFlags = append(updateFlags, base64.StdEncoding.EncodeToString([]byte(s.Find(".entry__title.h2 a").Text())))
|
|
})
|
|
|
|
var res []*model.GameItem
|
|
for i, u := range urls {
|
|
if db.IsXatabCrawled(updateFlags[i]) {
|
|
c.LogCrawlSkip(u)
|
|
continue
|
|
}
|
|
|
|
item, err := c.CrawlByUrl(u)
|
|
if err != nil {
|
|
c.LogCrawlByUrlError(u, err)
|
|
continue
|
|
}
|
|
|
|
item.UpdateFlag = updateFlags[i]
|
|
if err := db.SaveGameItem(item); err != nil {
|
|
c.LogSaveGameItemError(u, err)
|
|
continue
|
|
}
|
|
|
|
res = append(res, item)
|
|
|
|
if err := OrganizeGameItem(item); err != nil {
|
|
c.LogOrganizeGameWarn(item.ID, item.Name, err)
|
|
continue
|
|
}
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (c *XatabCrawler) CrawlByUrl(URL string) (*model.GameItem, error) {
|
|
c.LogCrawlByUrlStart(URL)
|
|
|
|
resp, err := utils.Request().SetLogger(c.logger.Sugar()).Get(URL)
|
|
if err != nil {
|
|
return nil, c.ErrRequest(URL, err)
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return nil, c.ErrParseDoc(URL, err)
|
|
}
|
|
|
|
item, err := db.GetGameItemByUrl(URL)
|
|
if err == nil {
|
|
return nil, c.ErrDBQuery(err)
|
|
}
|
|
|
|
item.Url = URL
|
|
item.RawName = strings.TrimSpace(doc.Find(".inner-entry__title").First().Text())
|
|
if item.RawName == "" {
|
|
return nil, c.ErrGetGameItemDetail(URL, errors.New("failed to get game name"))
|
|
}
|
|
item.Name = XatabFormatter(item.RawName)
|
|
item.Author = "Xatab"
|
|
item.Platform = "windows"
|
|
|
|
downloadURL := doc.Find("#download>a").First().AttrOr("href", "")
|
|
if downloadURL == "" {
|
|
return nil, c.ErrGetGameItemDetail(URL, errors.New("failed to get download link"))
|
|
}
|
|
|
|
resp, err = utils.Request().SetLogger(c.logger.Sugar()).SetHeaders(map[string]string{"Referer": URL}).Get(downloadURL)
|
|
if err != nil {
|
|
return nil, c.ErrRequest(downloadURL, err)
|
|
}
|
|
|
|
magnet, size, err := utils.ConvertTorrentToMagnet(resp.Body())
|
|
if err != nil {
|
|
return nil, c.ErrConvertTorrent(err)
|
|
}
|
|
|
|
item.Size = size
|
|
item.Downloads = map[string]string{
|
|
"magnet": magnet,
|
|
}
|
|
|
|
return item, nil
|
|
}
|
|
|
|
// CrawlMulti crawls multiple pages from Xatab.
|
|
func (c *XatabCrawler) CrawlMulti(pages []int) ([]*model.GameItem, error) {
|
|
totalPageNum, err := c.GetTotalPageNum()
|
|
if err != nil {
|
|
return nil, c.ErrGetTotalPageNum(err)
|
|
}
|
|
|
|
var res []*model.GameItem
|
|
for _, page := range pages {
|
|
if page > totalPageNum || page < 1 {
|
|
c.LogPageExceedWarn(page)
|
|
continue
|
|
}
|
|
|
|
items, err := c.Crawl(page)
|
|
if err != nil {
|
|
c.LogCrawlError(page, err)
|
|
continue
|
|
}
|
|
|
|
res = append(res, items...)
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (c *XatabCrawler) CrawlAll() ([]*model.GameItem, error) {
|
|
totalPageNum, err := c.GetTotalPageNum()
|
|
if err != nil {
|
|
return nil, c.ErrGetTotalPageNum(err)
|
|
}
|
|
|
|
var res []*model.GameItem
|
|
for i := 1; i <= totalPageNum; i++ {
|
|
items, err := c.Crawl(i)
|
|
if err != nil {
|
|
c.LogCrawlError(i, err)
|
|
continue
|
|
}
|
|
|
|
res = append(res, items...)
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (c *XatabCrawler) GetTotalPageNum() (int, error) {
|
|
resp, err := utils.Request().SetLogger(c.logger.Sugar()).Get(constant.XatabBaseURL)
|
|
if err != nil {
|
|
return 0, c.ErrRequest(constant.XatabBaseURL, err)
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return 0, c.ErrParseDoc(constant.XatabBaseURL, err)
|
|
}
|
|
|
|
pageStr := doc.Find(".pagination>a").Last().Text()
|
|
totalPageNum, err := strconv.Atoi(pageStr)
|
|
if err != nil {
|
|
return 0, c.ErrParseInt(pageStr, err)
|
|
}
|
|
|
|
return totalPageNum, nil
|
|
}
|
|
|
|
func XatabFormatter(name string) string {
|
|
reg1 := regexp.MustCompile(`(?i)v(er)?\s?(\.)?\d+(\.\d+)*`)
|
|
if index := reg1.FindIndex([]byte(name)); index != nil {
|
|
name = name[:index[0]]
|
|
}
|
|
if index := strings.Index(name, "["); index != -1 {
|
|
name = name[:index]
|
|
}
|
|
if index := strings.Index(name, "("); index != -1 {
|
|
name = name[:index]
|
|
}
|
|
if index := strings.Index(name, "{"); index != -1 {
|
|
name = name[:index]
|
|
}
|
|
if index := strings.Index(name, "+"); index != -1 {
|
|
name = name[:index]
|
|
}
|
|
name = strings.TrimSpace(name)
|
|
|
|
for _, re := range xatabRegexps {
|
|
name = re.ReplaceAllString(name, "")
|
|
}
|
|
|
|
if index := strings.Index(name, "/"); index != -1 {
|
|
names := strings.Split(name, "/")
|
|
longestLength := 0
|
|
longestName := ""
|
|
for _, n := range names {
|
|
if !utils.ContainsRussian(n) && len(n) > longestLength {
|
|
longestLength = len(n)
|
|
longestName = n
|
|
}
|
|
}
|
|
name = longestName
|
|
}
|
|
|
|
return strings.TrimSpace(name)
|
|
}
|
|
|
|
// xatabRegexps lists the patterns that XatabFormatter deletes from a title.
// The single entry matches a whitespace character followed by "PC" at the
// very end of the string, case-insensitively.
var xatabRegexps = []*regexp.Regexp{regexp.MustCompile(`(?i)\sPC$`)}
|