package crawler

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"

	"github.com/nitezs/pcgamedb/constant"
	"github.com/nitezs/pcgamedb/db"
	"github.com/nitezs/pcgamedb/model"
	"github.com/nitezs/pcgamedb/utils"

	"github.com/PuerkitoBio/goquery"
	"go.uber.org/zap"
)
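
// Formatter converts a raw torrent title into a normalized game name.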
type Formatter func(string) string
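
// s1337xCrawler crawls game torrents from a single 1337x source page
// and persists the results via the db package.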
type s1337xCrawler struct {
	source    string
	formatter Formatter
	logger    *zap.Logger
}
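
// New1337xCrawler returns a crawler for one 1337x source. A minimal
// sketch of typical usage (the source name and formatter here are
// illustrative, not prescribed by this package):
//
//	c := New1337xCrawler("dodi-torrents", strings.TrimSpace, zap.L())
//	items, err := c.CrawlAll()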
func New1337xCrawler(source string, formatter Formatter, logger *zap.Logger) *s1337xCrawler {
	return &s1337xCrawler{
		source:    source,
		formatter: formatter,
		logger:    logger,
	}
}
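
// Crawl fetches one listing page, collects the detail-page links, and
// crawls every game that has not been seen before. Failures to crawl,
// save, or organize an item are logged and skipped so one bad entry
// does not abort the whole page.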
func (c *s1337xCrawler) Crawl(page int) ([]*model.GameItem, error) {
	var resp *utils.FetchResponse
	var doc *goquery.Document
	var err error
	requestUrl := fmt.Sprintf("%s/%s/%d/", constant.C1337xBaseURL, c.source, page)
	resp, err = utils.Fetch(utils.FetchConfig{
		Url: requestUrl,
	})
	if err != nil {
		return nil, err
	}
	doc, err = goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
	if err != nil {
		return nil, err
	}
	// Each result row links twice to the detail page; the second <a>
	// inside .name carries the href we want.
	trSelection := doc.Find("tbody>tr")
	urls := []string{}
	trSelection.Each(func(i int, trNode *goquery.Selection) {
		nameSelection := trNode.Find(".name").First()
		if aNode := nameSelection.Find("a").Eq(1); aNode.Length() > 0 {
			url, _ := aNode.Attr("href")
			urls = append(urls, url)
		}
	})
	var res []*model.GameItem
	for _, u := range urls {
		u = fmt.Sprintf("%s%s", constant.C1337xBaseURL, u)
		if db.IsGameCrawledByURL(u) {
			continue
		}
		c.logger.Info("Crawling", zap.String("URL", u))
		item, err := c.CrawlByUrl(u)
		if err != nil {
			c.logger.Warn("Failed to crawl", zap.Error(err), zap.String("URL", u))
			continue
		}
		err = db.SaveGameItem(item)
		if err != nil {
			c.logger.Warn("Failed to save item", zap.Error(err), zap.String("URL", u))
			continue
		}
		res = append(res, item)
		info, err := OrganizeGameItem(item)
		if err != nil {
			c.logger.Warn("Failed to organize", zap.Error(err), zap.String("URL", u))
			continue
		}
		err = db.SaveGameInfo(info)
		if err != nil {
			c.logger.Warn("Failed to save info", zap.Error(err), zap.String("URL", u))
			continue
		}
	}
	return res, nil
}
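
// CrawlByUrl fetches a single torrent detail page and builds a
// model.GameItem from its title, size, magnet link, and source name.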
func (c *s1337xCrawler) CrawlByUrl(url string) (*model.GameItem, error) {
	resp, err := utils.Fetch(utils.FetchConfig{
		Url: url,
	})
	if err != nil {
		return nil, err
	}
	var item = &model.GameItem{}
	item.Url = url
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
	if err != nil {
		return nil, err
	}
	// The detail page lists metadata as <li><strong>key</strong><span>value</span></li>.
	selection := doc.Find(".torrent-detail-page ul.list>li")
	info := make(map[string]string)
	selection.Each(func(i int, li *goquery.Selection) {
		info[strings.TrimSpace(li.Find("strong").Text())] = strings.TrimSpace(li.Find("span").Text())
	})
	magnetRegex := regexp.MustCompile(`magnet:\?[^"]*`)
	magnet := magnetRegex.FindString(string(resp.Data))
	if magnet == "" {
		return nil, errors.New("magnet link not found")
	}
	item.Size = info["Total size"]
	// Page titles look like "Download <name> Torrent | 1337x".
	item.RawName = doc.Find("title").Text()
	item.RawName = strings.Replace(item.RawName, "Download ", "", 1)
	item.RawName = strings.TrimSpace(strings.Replace(item.RawName, "Torrent | 1337x", " ", 1))
	item.Name = c.formatter(item.RawName)
	item.Download = magnet
	item.Author = strings.ReplaceAll(c.source, "-torrents", "")
	return item, nil
}
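
// CrawlMulti crawls the given listing pages, silently skipping page
// numbers beyond the site's last page.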
func (c *s1337xCrawler) CrawlMulti(pages []int) (res []*model.GameItem, err error) {
	var items []*model.GameItem
	totalPageNum, err := c.GetTotalPageNum()
	if err != nil {
		return nil, err
	}
	for _, page := range pages {
		if page > totalPageNum {
			continue
		}
		items, err = c.Crawl(page)
		if err != nil {
			return nil, err
		}
		res = append(res, items...)
	}
	return res, nil
}
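
// CrawlAll crawls every listing page for the source, from the first to
// the last page reported by the site's pagination.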
func (c *s1337xCrawler) CrawlAll() (res []*model.GameItem, err error) {
	totalPageNum, err := c.GetTotalPageNum()
	if err != nil {
		return nil, err
	}
	var items []*model.GameItem
	for i := 1; i <= totalPageNum; i++ {
		items, err = c.Crawl(i)
		if err != nil {
			return nil, err
		}
		res = append(res, items...)
	}
	return res, nil
}
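
// GetTotalPageNum fetches the first listing page and parses the page
// number out of the pagination's "last page" link.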
func (c *s1337xCrawler) GetTotalPageNum() (int, error) {
	var resp *utils.FetchResponse
	var doc *goquery.Document
	var err error

	requestUrl := fmt.Sprintf("%s/%s/%d/", constant.C1337xBaseURL, c.source, 1)
	resp, err = utils.Fetch(utils.FetchConfig{
		Url: requestUrl,
	})
	if err != nil {
		return 0, err
	}
	doc, err = goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
	if err != nil {
		return 0, err
	}
	// The last pagination item links to "/<source>/<N>/"; strip the source
	// name and slashes so only the page number remains.
	selection := doc.Find(".last")
	pageStr, exist := selection.Find("a").Attr("href")
	if !exist {
		return 0, errors.New("total page num not found")
	}
	pageStr = strings.ReplaceAll(pageStr, c.source, "")
	pageStr = strings.ReplaceAll(pageStr, "/", "")
	totalPageNum, err := strconv.Atoi(pageStr)
	if err != nil {
		return 0, err
	}
	return totalPageNum, nil
}