143 lines
3.7 KiB
Go
143 lines
3.7 KiB
Go
package crawler
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"pcgamedb/constant"
|
|
"pcgamedb/db"
|
|
"pcgamedb/model"
|
|
"pcgamedb/utils"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
type SteamRIPCrawler struct {
|
|
logger *zap.Logger
|
|
}
|
|
|
|
func NewSteamRIPCrawler(logger *zap.Logger) *SteamRIPCrawler {
|
|
return &SteamRIPCrawler{
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
func (c *SteamRIPCrawler) Name() string {
|
|
return "SteamRIPCrawler"
|
|
}
|
|
|
|
func (c *SteamRIPCrawler) CrawlByUrl(URL string) (*model.GameItem, error) {
|
|
resp, err := utils.Request().Get(URL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
item, err := db.GetGameItemByUrl(URL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
item.RawName = strings.TrimSpace(doc.Find(".entry-title").First().Text())
|
|
item.Name = SteamRIPFormatter(item.RawName)
|
|
item.Url = URL
|
|
item.Author = "SteamRIP"
|
|
item.Platform = "windows"
|
|
sizeRegex := regexp.MustCompile(`(?i)<li><strong>Game Size:\s?</strong>(.*?)</li>`)
|
|
sizeRegexRes := sizeRegex.FindStringSubmatch(string(resp.Body()))
|
|
if len(sizeRegexRes) != 0 {
|
|
item.Size = strings.TrimSpace(sizeRegexRes[1])
|
|
} else {
|
|
item.Size = "unknown"
|
|
}
|
|
megadbRegex := regexp.MustCompile(`(?i)(?:https?:)?(//megadb\.net/[^"]+)`)
|
|
megadbRegexRes := megadbRegex.FindStringSubmatch(string(resp.Body()))
|
|
if len(megadbRegexRes) != 0 {
|
|
item.Download = fmt.Sprintf("https:%s", megadbRegexRes[1])
|
|
}
|
|
if item.Download == "" {
|
|
gofileRegex := regexp.MustCompile(`(?i)(?:https?:)?(//gofile\.io/d/[^"]+)`)
|
|
gofileRegexRes := gofileRegex.FindStringSubmatch(string(resp.Body()))
|
|
if len(gofileRegexRes) != 0 {
|
|
item.Download = fmt.Sprintf("https:%s", gofileRegexRes[1])
|
|
}
|
|
}
|
|
if item.Download == "" {
|
|
filecryptRegex := regexp.MustCompile(`(?i)(?:https?:)?(//filecrypt\.co/Container/[^"]+)`)
|
|
filecryptRegexRes := filecryptRegex.FindStringSubmatch(string(resp.Body()))
|
|
if len(filecryptRegexRes) != 0 {
|
|
item.Download = fmt.Sprintf("https:%s", filecryptRegexRes[1])
|
|
}
|
|
}
|
|
if item.Download == "" {
|
|
return nil, errors.New("failed to find download link")
|
|
}
|
|
|
|
return item, nil
|
|
}
|
|
|
|
func (c *SteamRIPCrawler) Crawl(num int) ([]*model.GameItem, error) {
|
|
count := 0
|
|
resp, err := utils.Request().Get(constant.SteamRIPGameListURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body()))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var items []*model.GameItem
|
|
var urls []string
|
|
var updateFlags []string // title
|
|
doc.Find(".az-list-item>a").Each(func(i int, s *goquery.Selection) {
|
|
u, exist := s.Attr("href")
|
|
if !exist {
|
|
return
|
|
}
|
|
urls = append(urls, fmt.Sprintf("%s%s", constant.SteamRIPBaseURL, u))
|
|
updateFlags = append(updateFlags, s.Text())
|
|
})
|
|
for i, u := range urls {
|
|
if count == num {
|
|
break
|
|
}
|
|
if db.IsSteamRIPCrawled(updateFlags[i]) {
|
|
continue
|
|
}
|
|
c.logger.Info("Crawling", zap.String("URL", u))
|
|
item, err := c.CrawlByUrl(u)
|
|
if err != nil {
|
|
c.logger.Warn("Failed to crawl", zap.Error(err), zap.String("URL", u))
|
|
continue
|
|
}
|
|
item.UpdateFlag = updateFlags[i]
|
|
if err := db.SaveGameItem(item); err != nil {
|
|
c.logger.Warn("Failed to save item", zap.Error(err))
|
|
continue
|
|
}
|
|
items = append(items, item)
|
|
count++
|
|
if err := OrganizeGameItem(item); err != nil {
|
|
c.logger.Warn("Failed to organize", zap.Error(err), zap.String("URL", u))
|
|
continue
|
|
}
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
func (c *SteamRIPCrawler) CrawlAll() ([]*model.GameItem, error) {
|
|
return c.Crawl(-1)
|
|
}
|
|
|
|
// steamRIPParenRegex matches a non-empty parenthesised group such as
// "(v1.0 + DLC)". It is compiled once instead of on every call.
var steamRIPParenRegex = regexp.MustCompile(`\([^\)]+\)`)

// SteamRIPFormatter turns a raw SteamRIP page title into a plain game name.
//
// It removes every non-empty parenthesised group, removes every occurrence of
// "Free Download", and trims surrounding whitespace. Whitespace inside the
// name is left untouched, and an empty "()" is kept as is.
func SteamRIPFormatter(name string) string {
	name = steamRIPParenRegex.ReplaceAllString(name, "")
	name = strings.ReplaceAll(name, "Free Download", "")
	return strings.TrimSpace(name)
}
|