// pcgamedb/crawler/1337x.go

package crawler

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"

	"pcgamedb/constant"
	"pcgamedb/db"
	"pcgamedb/model"
	"pcgamedb/utils"

	"github.com/PuerkitoBio/goquery"
	"go.uber.org/zap"
)
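
// Formatter normalizes a raw torrent title into a clean game name.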
type Formatter func(string) string
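
// s1337xCrawler crawls the torrent lists of a single 1337x source
// (an uploader page whose name conventionally ends in "-torrents")
// and converts each torrent into a model.GameItem.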
type s1337xCrawler struct {
	source    string
	formatter Formatter
	logger    *zap.Logger
}
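
// New1337xCrawler creates a crawler for the given 1337x source; the
// formatter is applied to each raw torrent title to derive the game name.
//
// Minimal usage sketch (the source name and formatter are illustrative,
// not taken from this repo):
//
//	c := New1337xCrawler(
//		"example-torrents",
//		func(raw string) string { return strings.TrimSpace(raw) },
//		zap.NewNop(),
//	)
//	items, err := c.Crawl(1)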
func New1337xCrawler(source string, formatter Formatter, logger *zap.Logger) *s1337xCrawler {
	return &s1337xCrawler{
		source:    source,
		formatter: formatter,
		logger:    logger,
	}
}
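
// Crawl fetches one list page of the source, collects the detail-page
// links from the name column, and crawls, saves, and organizes every
// game not already in the database. Failures on individual items are
// logged and skipped rather than aborting the page.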
func (c *s1337xCrawler) Crawl(page int) ([]*model.GameItem, error) {
	requestUrl := fmt.Sprintf("%s/%s/%d/", constant.C1337xBaseURL, c.source, page)
	resp, err := utils.Fetch(utils.FetchConfig{
		Url: requestUrl,
	})
	if err != nil {
		return nil, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
	if err != nil {
		return nil, err
	}
	// Each row's name cell holds two links; the second one points to the
	// torrent detail page.
	trSelection := doc.Find("tbody>tr")
	urls := []string{}
	trSelection.Each(func(i int, trNode *goquery.Selection) {
		nameSelection := trNode.Find(".name").First()
		if aNode := nameSelection.Find("a").Eq(1); aNode.Length() > 0 {
			url, _ := aNode.Attr("href")
			urls = append(urls, url)
		}
	})
	var res []*model.GameItem
	for _, u := range urls {
		u = fmt.Sprintf("%s%s", constant.C1337xBaseURL, u)
		if db.IsGameCrawledByURL(u) {
			continue
		}
		c.logger.Info("Crawling", zap.String("URL", u))
		item, err := c.CrawlByUrl(u)
		if err != nil {
			c.logger.Warn("Failed to crawl", zap.Error(err), zap.String("URL", u))
			continue
		}
		err = db.SaveGameItem(item)
		if err != nil {
			c.logger.Warn("Failed to save item", zap.Error(err), zap.String("URL", u))
			continue
		}
		res = append(res, item)
		info, err := OrganizeGameItem(item)
		if err != nil {
			c.logger.Warn("Failed to organize", zap.Error(err), zap.String("URL", u))
			continue
		}
		err = db.SaveGameInfo(info)
		if err != nil {
			c.logger.Warn("Failed to save info", zap.Error(err), zap.String("URL", u))
			continue
		}
	}
	return res, nil
}
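
// CrawlByUrl fetches a single torrent detail page and extracts the
// title, total size, magnet link, and author into a model.GameItem.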
func (c *s1337xCrawler) CrawlByUrl(url string) (*model.GameItem, error) {
	resp, err := utils.Fetch(utils.FetchConfig{
		Url: url,
	})
	if err != nil {
		return nil, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
	if err != nil {
		return nil, err
	}
	item := &model.GameItem{}
	item.Url = url
	// Collect the "<strong>key</strong><span>value</span>" pairs from the
	// detail list (e.g. "Total size").
	selection := doc.Find(".torrent-detail-page ul.list>li")
	info := make(map[string]string)
	selection.Each(func(i int, li *goquery.Selection) {
		info[strings.TrimSpace(li.Find("strong").Text())] = strings.TrimSpace(li.Find("span").Text())
	})
	magnetRegex := regexp.MustCompile(`magnet:\?[^"]*`)
	magnet := magnetRegex.FindString(string(resp.Data))
	if magnet == "" {
		return nil, errors.New("magnet link not found")
	}
	item.Size = info["Total size"]
	// The page title looks like "Download <name> Torrent | 1337x".
	item.RawName = doc.Find("title").Text()
	item.RawName = strings.Replace(item.RawName, "Download ", "", 1)
	item.RawName = strings.TrimSpace(strings.Replace(item.RawName, "Torrent | 1337x", " ", 1))
	item.Name = c.formatter(item.RawName)
	item.Download = magnet
	item.Author = strings.ReplaceAll(c.source, "-torrents", "")
	return item, nil
}
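
// CrawlMulti crawls the given page numbers, silently skipping any page
// beyond the source's last page.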
func (c *s1337xCrawler) CrawlMulti(pages []int) (res []*model.GameItem, err error) {
	totalPageNum, err := c.GetTotalPageNum()
	if err != nil {
		return nil, err
	}
	for _, page := range pages {
		if page > totalPageNum {
			continue
		}
		items, err := c.Crawl(page)
		if err != nil {
			return nil, err
		}
		res = append(res, items...)
	}
	return res, nil
}
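
// CrawlAll crawls every page of the source, from the first page to the
// last one reported by the pagination.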
func (c *s1337xCrawler) CrawlAll() (res []*model.GameItem, err error) {
	totalPageNum, err := c.GetTotalPageNum()
	if err != nil {
		return nil, err
	}
	for i := 1; i <= totalPageNum; i++ {
		items, err := c.Crawl(i)
		if err != nil {
			return nil, err
		}
		res = append(res, items...)
	}
	return res, nil
}
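
// GetTotalPageNum fetches the first list page and derives the total
// page count from the href of the pagination's ".last" link.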
func (c *s1337xCrawler) GetTotalPageNum() (int, error) {
	requestUrl := fmt.Sprintf("%s/%s/%d/", constant.C1337xBaseURL, c.source, 1)
	resp, err := utils.Fetch(utils.FetchConfig{
		Url: requestUrl,
	})
	if err != nil {
		return 0, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
	if err != nil {
		return 0, err
	}
	// The ".last" pagination link's href has the form "/<source>/<N>/";
	// strip the source name and slashes to recover N.
	selection := doc.Find(".last")
	pageStr, exist := selection.Find("a").Attr("href")
	if !exist {
		return 0, errors.New("total page num not found")
	}
	pageStr = strings.ReplaceAll(pageStr, c.source, "")
	pageStr = strings.ReplaceAll(pageStr, "/", "")
	totalPageNum, err := strconv.Atoi(pageStr)
	if err != nil {
		return 0, err
	}
	return totalPageNum, nil
}