308 lines
7.4 KiB
Go
308 lines
7.4 KiB
Go
package crawler
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"game-crawler/cache"
|
|
"game-crawler/constant"
|
|
"game-crawler/db"
|
|
"game-crawler/model"
|
|
"net/http"
|
|
"path"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"git.nite07.com/nite/ccs"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
type NxbrewCrawler struct {
|
|
*BaseLogger
|
|
*BaseError
|
|
cfClearanceUrl string
|
|
logger *zap.Logger
|
|
}
|
|
|
|
func NewNxbrewCrawler(cfClearanceUrl string, logger *zap.Logger) *NxbrewCrawler {
|
|
return &NxbrewCrawler{
|
|
BaseLogger: &BaseLogger{
|
|
logger: logger,
|
|
},
|
|
BaseError: &BaseError{},
|
|
cfClearanceUrl: cfClearanceUrl,
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
// sizeRegex extracts a file size such as "1.5 GB", "700MB" or ".5 GB" from
// free text.
//
// The numeric part must contain at least one digit, so punctuation that merely
// precedes a unit (for example "... GB") is not reported as a size. The unit
// is matched case-sensitively and remains capture group 1.
var sizeRegex = regexp.MustCompile(`(?:\d+\.?\d*|\.\d+)\s?(GB|MB|KB)`)
|
|
|
|
func (c *NxbrewCrawler) Name() string {
|
|
return "Nxbrew"
|
|
}
|
|
|
|
func (c *NxbrewCrawler) getSession() (*ccs.Session, error) {
|
|
key := "nxbrew_session"
|
|
var session ccs.Session
|
|
var err error
|
|
if val, exists := cache.Get(key); exists {
|
|
err := json.Unmarshal([]byte(val), &session)
|
|
if err != nil {
|
|
return nil, c.ErrUnmarshalJSON(err)
|
|
}
|
|
return &session, nil
|
|
}
|
|
|
|
session, err = ccs.WAFSession(c.cfClearanceUrl, constant.NxbrewBaseURL)
|
|
if err != nil {
|
|
return nil, c.ErrRequest(c.cfClearanceUrl, err)
|
|
}
|
|
|
|
jsonBytes, err := json.Marshal(session)
|
|
if err != nil {
|
|
return nil, c.ErrMarshalJSON(err)
|
|
}
|
|
|
|
_ = cache.SetWithExpire(key, string(jsonBytes), 24*time.Hour)
|
|
|
|
return &session, nil
|
|
}
|
|
|
|
func (c *NxbrewCrawler) CrawlByUrl(URL string) (*model.GameItem, error) {
|
|
c.LogCrawlByUrlStart(URL)
|
|
session, err := c.getSession()
|
|
if err != nil {
|
|
return nil, c.ErrGetSession(err)
|
|
}
|
|
|
|
resp, err := ccs.RequestWithWAFSession(http.MethodGet, URL, *session, nil)
|
|
if err != nil {
|
|
return nil, c.ErrRequest(URL, err)
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(resp.Body))
|
|
if err != nil {
|
|
return nil, c.ErrParseDoc(URL, err)
|
|
}
|
|
|
|
item, err := db.GetGameItemByUrl(URL)
|
|
if err != nil {
|
|
return nil, c.ErrDBQuery(err)
|
|
}
|
|
|
|
item.Author = "Nxbrew"
|
|
|
|
doc.Find(".wp-block-media-text__content strong").Each(func(i int, s *goquery.Selection) {
|
|
if strings.TrimSpace(s.Text()) == "Title:" {
|
|
for _, node := range s.Nodes {
|
|
if node.NextSibling != nil {
|
|
item.Name = strings.TrimSpace(node.NextSibling.Data)
|
|
item.RawName = item.Name
|
|
break
|
|
}
|
|
}
|
|
}
|
|
})
|
|
|
|
if item.Name == "" {
|
|
return nil, c.ErrGetGameItemDetail(URL, errors.New("failed to get game name"))
|
|
}
|
|
|
|
item.Platform = "switch"
|
|
item.Url = URL
|
|
|
|
downloadLinks := map[string]string{}
|
|
|
|
// var getLinkErr error
|
|
doc.Find(".wp-block-columns").Each(func(i int, s *goquery.Selection) {
|
|
if s.Children().Length() == 2 {
|
|
label := s.Children().First().Text()
|
|
if i == 0 {
|
|
item.Size = sizeRegex.FindString(label)
|
|
}
|
|
s.Children().Eq(1).Find("p").Each(func(j int, s2 *goquery.Selection) {
|
|
s2.Find("a").Each(func(k int, s3 *goquery.Selection) {
|
|
link := s3.AttrOr("href", "")
|
|
if link != "" {
|
|
// if strings.Contains(link, "ouo.io") {
|
|
// link, err = c.getLinkFromOUO(link)
|
|
// if err != nil {
|
|
// getLinkErr = err
|
|
// return
|
|
// }
|
|
// }
|
|
// if strings.Contains(link, "mgnet.site") {
|
|
// link, err = c.getLinkFromMgnet(link)
|
|
// if err != nil {
|
|
// getLinkErr = err
|
|
// return
|
|
// }
|
|
// }
|
|
if s2.Find("strong").Length() != 0 {
|
|
downloadLinks[fmt.Sprintf("%s(%s %s)", strings.TrimSpace(label), strings.TrimSpace(s2.Find("strong").Text()), strings.TrimSpace(s3.Text()))] = link
|
|
} else {
|
|
downloadLinks[fmt.Sprintf("%s(%s)", strings.TrimSpace(label), strings.TrimSpace(s3.Text()))] = link
|
|
}
|
|
}
|
|
})
|
|
})
|
|
}
|
|
})
|
|
// if getLinkErr != nil {
|
|
// return nil, c.ErrGetGameItemDetail(URL, getLinkErr)
|
|
// }
|
|
item.Downloads = downloadLinks
|
|
if item.Size == "" {
|
|
item.Size = "unknown"
|
|
}
|
|
|
|
return item, nil
|
|
}
|
|
|
|
func (c *NxbrewCrawler) Crawl(page int) ([]*model.GameItem, error) {
|
|
c.LogCrawlStart(page)
|
|
session, err := c.getSession()
|
|
if err != nil {
|
|
return nil, c.ErrGetSession(err)
|
|
}
|
|
resp, err := ccs.RequestWithWAFSession(http.MethodGet, fmt.Sprintf(constant.NxbrewURL, page), *session, nil)
|
|
if err != nil {
|
|
return nil, c.ErrRequest(fmt.Sprintf(constant.NxbrewURL, page), err)
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(resp.Body))
|
|
if err != nil {
|
|
return nil, c.ErrParseDoc(fmt.Sprintf(constant.NxbrewURL, page), err)
|
|
}
|
|
var urls []string
|
|
var updateFlags []string // base64(url+updateTime)
|
|
doc.Find("article").Each(func(i int, s *goquery.Selection) {
|
|
u := s.Find(".post-title a").AttrOr("href", "")
|
|
if u == "" {
|
|
return
|
|
}
|
|
urls = append(urls, u)
|
|
d := s.Find(".post-date").Text()
|
|
updateFlags = append(updateFlags, base64.StdEncoding.EncodeToString([]byte(u+d)))
|
|
})
|
|
|
|
var res []*model.GameItem
|
|
for i, u := range urls {
|
|
if db.IsGameCrawled(updateFlags[i], "Nxbrew") {
|
|
c.LogCrawlSkip(u)
|
|
continue
|
|
}
|
|
|
|
item, err := c.CrawlByUrl(u)
|
|
if err != nil {
|
|
c.LogCrawlByUrlError(u, err)
|
|
continue
|
|
}
|
|
item.UpdateFlag = updateFlags[i]
|
|
|
|
err = db.SaveGameItem(item)
|
|
if err != nil {
|
|
c.LogSaveGameItemError(u, err)
|
|
continue
|
|
}
|
|
|
|
res = append(res, item)
|
|
|
|
if err := OrganizeGameItem(item); err != nil {
|
|
c.LogOrganizeGameWarn(item.ID, item.Name, err)
|
|
continue
|
|
}
|
|
}
|
|
return res, nil
|
|
}
|
|
|
|
func (c *NxbrewCrawler) CrawlAll() ([]*model.GameItem, error) {
|
|
totalPage, err := c.GetTotalPageNum()
|
|
if err != nil {
|
|
return nil, c.ErrGetTotalPageNum(err)
|
|
}
|
|
|
|
var res []*model.GameItem
|
|
for i := 1; i <= totalPage; i++ {
|
|
items, err := c.Crawl(i)
|
|
if err != nil {
|
|
c.LogCrawlError(i, err)
|
|
continue
|
|
}
|
|
res = append(res, items...)
|
|
}
|
|
return res, nil
|
|
}
|
|
|
|
func (c *NxbrewCrawler) CrawlMulti(pages []int) (res []*model.GameItem, err error) {
|
|
totalPageNum, err := c.GetTotalPageNum()
|
|
if err != nil {
|
|
return nil, c.ErrGetTotalPageNum(err)
|
|
}
|
|
|
|
for _, page := range pages {
|
|
if page > totalPageNum || page < 1 {
|
|
c.LogPageExceedWarn(page)
|
|
continue
|
|
}
|
|
|
|
items, err := c.Crawl(page)
|
|
if err != nil {
|
|
c.LogCrawlError(page, err)
|
|
continue
|
|
}
|
|
res = append(res, items...)
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (c *NxbrewCrawler) GetTotalPageNum() (int, error) {
|
|
session, err := c.getSession()
|
|
if err != nil {
|
|
return 0, c.ErrGetSession(err)
|
|
}
|
|
resp, err := ccs.RequestWithWAFSession(http.MethodGet, fmt.Sprintf(constant.NxbrewURL, 1), *session, nil)
|
|
if err != nil {
|
|
return 0, c.ErrRequest(fmt.Sprintf(constant.NxbrewURL, 1), err)
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(resp.Body))
|
|
if err != nil {
|
|
return 0, c.ErrParseDoc(fmt.Sprintf(constant.NxbrewURL, 1), err)
|
|
}
|
|
pageStr, _ := doc.Find("a.last").Attr("href")
|
|
totalPage, err := strconv.Atoi(path.Base(pageStr))
|
|
if err != nil {
|
|
return 0, c.ErrParseInt(pageStr, err)
|
|
}
|
|
return totalPage, nil
|
|
}
|
|
|
|
// func (c *NxbrewCrawler) getLinkFromOUO(URL string) (string, error) {
|
|
// key := fmt.Sprintf("ouo:%s", url.QueryEscape(URL))
|
|
// if val, exists := cache.Get(key); exists {
|
|
// return val, nil
|
|
// }
|
|
// res, err := utils.GetLinkFromOUO(URL)
|
|
// if err != nil {
|
|
// return URL, fmt.Errorf("failed to get link from ouo: %s, error: %s", URL, err)
|
|
// }
|
|
// _ = cache.Set(key, res)
|
|
// return res, nil
|
|
// }
|
|
|
|
// func (c *NxbrewCrawler) getLinkFromMgnet(URL string) (string, error) {
|
|
// key := fmt.Sprintf("mgnet:%s", url.QueryEscape(URL))
|
|
// if val, exists := cache.Get(key); exists {
|
|
// return val, nil
|
|
// }
|
|
// res, err := utils.GetLinkFromMgnet(URL)
|
|
// if err != nil {
|
|
// return URL, fmt.Errorf("failed to get link from mgnet: %s, error: %s", URL, err)
|
|
// }
|
|
// _ = cache.Set(key, res)
|
|
// return res, nil
|
|
// }
|