// Package crawler: Online-Fix crawler implementation.
package crawler

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"game-crawler/cache"
	"game-crawler/constant"
	"game-crawler/db"
	"game-crawler/model"
	"game-crawler/utils"

	"github.com/PuerkitoBio/goquery"
	"go.uber.org/zap"
)
// OnlineFixCrawler crawls game items from the Online-Fix site.
// It embeds BaseLogger and BaseError for shared logging and error helpers.
type OnlineFixCrawler struct {
	*BaseLogger
	*BaseError
	username string // Online-Fix account name, used to obtain session cookies
	password string // Online-Fix account password
	// NOTE(review): this duplicates BaseLogger.logger — presumably kept so the
	// request helpers can call c.logger.Sugar() directly; confirm before removing.
	logger *zap.Logger
}
func NewOnlineFixCrawler(username, password string, logger *zap.Logger) *OnlineFixCrawler {
|
||
return &OnlineFixCrawler{
|
||
BaseLogger: &BaseLogger{
|
||
logger: logger,
|
||
},
|
||
BaseError: &BaseError{},
|
||
username: username,
|
||
password: password,
|
||
logger: logger,
|
||
}
|
||
}
|
||
|
||
// Name returns the crawler's identifier, used in logs and crawler registries.
func (c *OnlineFixCrawler) Name() string {
	return "OnlineFixCrawler"
}
func (c *OnlineFixCrawler) Crawl(page int) ([]*model.GameItem, error) {
|
||
c.LogCrawlStart(page)
|
||
cookies, err := c.getCookies()
|
||
if err != nil {
|
||
return nil, c.ErrGetSession(err)
|
||
}
|
||
|
||
requestURL := fmt.Sprintf("%s/page/%d/", constant.OnlineFixURL, page)
|
||
resp, err := utils.Request().SetLogger(c.logger.Sugar()).SetHeader("Referer", constant.OnlineFixURL).SetCookies(cookies).Get(requestURL)
|
||
if err != nil {
|
||
return nil, c.ErrRequest(requestURL, err)
|
||
}
|
||
|
||
body := utils.Windows1251ToUTF8(resp.Body())
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
return nil, c.ErrParseDoc(requestURL, err)
|
||
}
|
||
|
||
var urls []string
|
||
var updateFlags []string
|
||
doc.Find("article.news").Each(func(i int, s *goquery.Selection) {
|
||
url := s.Find(".big-link").First().AttrOr("href", "")
|
||
if url != "" {
|
||
urls = append(urls, url)
|
||
updateFlags = append(updateFlags, base64.StdEncoding.EncodeToString([]byte(url+s.Find("time").Text())))
|
||
}
|
||
})
|
||
|
||
var res []*model.GameItem
|
||
for i, u := range urls {
|
||
if db.IsOnlineFixCrawled(updateFlags[i]) {
|
||
c.LogCrawlSkip(u)
|
||
continue
|
||
}
|
||
|
||
item, err := c.CrawlByUrl(u)
|
||
if err != nil {
|
||
c.LogCrawlByUrlError(u, err)
|
||
continue
|
||
}
|
||
|
||
item.UpdateFlag = updateFlags[i]
|
||
if err := db.SaveGameItem(item); err != nil {
|
||
c.LogSaveGameItemError(u, err)
|
||
continue
|
||
}
|
||
res = append(res, item)
|
||
|
||
if err := OrganizeGameItem(item); err != nil {
|
||
c.LogOrganizeGameWarn(item.ID, item.Name, err)
|
||
continue
|
||
}
|
||
}
|
||
|
||
return res, nil
|
||
}
|
||
|
||
func (c *OnlineFixCrawler) CrawlByUrl(URL string) (*model.GameItem, error) {
|
||
c.LogCrawlByUrlStart(URL)
|
||
cookies, err := c.getCookies()
|
||
if err != nil {
|
||
return nil, c.ErrGetSession(err)
|
||
}
|
||
|
||
resp, err := utils.Request().SetLogger(c.logger.Sugar()).SetHeaders(map[string]string{
|
||
"Referer": constant.OnlineFixURL,
|
||
}).SetCookies(cookies).Get(URL)
|
||
if err != nil {
|
||
return nil, c.ErrRequest(URL, err)
|
||
}
|
||
|
||
body := utils.Windows1251ToUTF8(resp.Body())
|
||
titleRegex := regexp.MustCompile(`(?i)<h1.*?>(.*?)</h1>`)
|
||
titleMatch := titleRegex.FindStringSubmatch(string(body))
|
||
if len(titleMatch) == 0 {
|
||
return nil, c.ErrGetGameItemDetail(URL, errors.New("failed to find title"))
|
||
}
|
||
|
||
item, err := db.GetGameItemByUrl(URL)
|
||
if err != nil {
|
||
item = &model.GameItem{}
|
||
}
|
||
|
||
item.RawName = titleMatch[1]
|
||
item.Name = OnlineFixFormatter(item.RawName)
|
||
item.Url = URL
|
||
item.Author = "OnlineFix"
|
||
item.Size = "0"
|
||
item.Platform = "windows"
|
||
|
||
downloadRegex := regexp.MustCompile(`(?i)<a[^>]+\bhref="([^"]+)"[^>]+>(Скачать с Online-Fix Hosters|Скачать Torrent|Скачать торрент)</a>`)
|
||
downloadMatch := downloadRegex.FindAllStringSubmatch(string(body), -1)
|
||
if len(downloadMatch) == 0 {
|
||
return nil, c.ErrGetGameItemDetail(URL, errors.New("failed to find download link"))
|
||
}
|
||
|
||
downloads := make(map[string]string)
|
||
for _, match := range downloadMatch {
|
||
downloadURL := match[1]
|
||
resp, err = utils.Request().SetLogger(c.logger.Sugar()).SetHeader("Referer", URL).SetCookies(cookies).Get(downloadURL)
|
||
body = utils.Windows1251ToUTF8(resp.Body())
|
||
|
||
if err != nil {
|
||
return nil, c.ErrRequest(downloadURL, err)
|
||
}
|
||
if strings.Contains(downloadURL, "uploads.online-fix.me") {
|
||
magnetRegex := regexp.MustCompile(`(?i)"(.*?).torrent"`)
|
||
magnetMatch := magnetRegex.FindStringSubmatch(string(body))
|
||
if len(magnetMatch) == 0 {
|
||
return nil, c.ErrGetGameItemDetail(URL, errors.New("failed to find magnet link"))
|
||
}
|
||
|
||
torrentURL := downloadURL + strings.Trim(magnetMatch[0], "\"")
|
||
resp, err = utils.Request().SetLogger(c.logger.Sugar()).SetHeader("Referer", URL).SetCookies(cookies).Get(torrentURL)
|
||
if err != nil {
|
||
return nil, c.ErrRequest(torrentURL, err)
|
||
}
|
||
|
||
magnet, size, err := utils.ConvertTorrentToMagnet(resp.Body())
|
||
if err != nil {
|
||
return nil, c.ErrConvertTorrent(err)
|
||
}
|
||
|
||
downloads["magnet"] = magnet
|
||
item.Size = size
|
||
}
|
||
if strings.Contains(downloadURL, "hosters.online-fix.me") {
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
return nil, c.ErrParseDoc(downloadURL, err)
|
||
}
|
||
|
||
type data struct {
|
||
DirectLink string `json:"direct_link"`
|
||
FileName string `json:"file_name"`
|
||
ID int `json:"id"`
|
||
}
|
||
var dataList []data
|
||
var jsons []string
|
||
doc.Find("div[data-links]").Each(func(i int, s *goquery.Selection) {
|
||
jsons = append(jsons, s.AttrOr("data-links", ""))
|
||
})
|
||
for _, jsonStr := range jsons {
|
||
if err := json.Unmarshal([]byte(jsonStr), &dataList); err != nil {
|
||
return nil, c.ErrUnmarshalJSON(err)
|
||
}
|
||
for _, d := range dataList {
|
||
downloads[fmt.Sprintf("%s(%v)", d.FileName, d.ID)] = d.DirectLink
|
||
}
|
||
}
|
||
}
|
||
}
|
||
item.Downloads = downloads
|
||
|
||
return item, nil
|
||
}
|
||
|
||
func (c *OnlineFixCrawler) CrawlMulti(pages []int) ([]*model.GameItem, error) {
|
||
totalPageNum, err := c.GetTotalPageNum()
|
||
if err != nil {
|
||
return nil, c.ErrGetTotalPageNum(err)
|
||
}
|
||
var res []*model.GameItem
|
||
for _, page := range pages {
|
||
if page > totalPageNum || page < 1 {
|
||
c.LogPageExceedWarn(page)
|
||
continue
|
||
}
|
||
|
||
items, err := c.Crawl(page)
|
||
if err != nil {
|
||
c.LogCrawlError(page, err)
|
||
continue
|
||
}
|
||
res = append(res, items...)
|
||
}
|
||
return res, nil
|
||
}
|
||
|
||
func (c *OnlineFixCrawler) CrawlAll() ([]*model.GameItem, error) {
|
||
var res []*model.GameItem
|
||
totalPageNum, err := c.GetTotalPageNum()
|
||
if err != nil {
|
||
return nil, c.ErrGetTotalPageNum(err)
|
||
}
|
||
|
||
for i := 1; i <= totalPageNum; i++ {
|
||
items, err := c.Crawl(i)
|
||
if err != nil {
|
||
c.LogCrawlError(i, err)
|
||
continue
|
||
}
|
||
res = append(res, items...)
|
||
}
|
||
return res, nil
|
||
}
|
||
|
||
func (c *OnlineFixCrawler) GetTotalPageNum() (int, error) {
|
||
resp, err := utils.Request().SetLogger(c.logger.Sugar()).SetHeader("Referer", constant.OnlineFixURL).Get(constant.OnlineFixURL)
|
||
if err != nil {
|
||
return 0, c.ErrRequest(constant.OnlineFixURL, err)
|
||
}
|
||
|
||
pageRegex := regexp.MustCompile(`(?i)<a href="https://online-fix.me/page/(\d+)/">.*?</a>`)
|
||
pageMatches := pageRegex.FindAllStringSubmatch(string(resp.Body()), -1)
|
||
if len(pageMatches) < 2 {
|
||
return 0, c.ErrMatchRegex(string(resp.Body()), pageRegex, errors.New("failed to find total page number"))
|
||
}
|
||
|
||
totalPageNum, err := strconv.Atoi(pageMatches[len(pageMatches)-2][1])
|
||
if err != nil {
|
||
return 0, c.ErrParseInt(pageMatches[len(pageMatches)-2][1], err)
|
||
}
|
||
|
||
return totalPageNum, nil
|
||
}
|
||
|
||
func (c *OnlineFixCrawler) getCookies() ([]*http.Cookie, error) {
|
||
if c.username == "" || c.password == "" {
|
||
return nil, c.ErrUserOrPassEmpty()
|
||
}
|
||
key := "onlinefix_cookies"
|
||
val, exists := cache.Get(key)
|
||
if exists {
|
||
var cookies []*http.Cookie
|
||
err := json.Unmarshal([]byte(val), &cookies)
|
||
if err != nil {
|
||
return nil, c.ErrUnmarshalJSON(err)
|
||
}
|
||
return cookies, nil
|
||
}
|
||
|
||
resp, err := utils.Request().SetLogger(c.logger.Sugar()).SetHeaders(map[string]string{
|
||
"X-Requested-With": "XMLHttpRequest",
|
||
"Referer": constant.OnlineFixURL,
|
||
}).Get(constant.OnlineFixCSRFURL)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to fetch CSRF token: %w", err)
|
||
}
|
||
|
||
type csrf struct {
|
||
Field string `json:"field"`
|
||
Value string `json:"value"`
|
||
}
|
||
|
||
var csrfToken csrf
|
||
if err = json.Unmarshal(resp.Body(), &csrfToken); err != nil {
|
||
return nil, c.ErrUnmarshalJSON(err)
|
||
}
|
||
|
||
cookies := resp.Cookies()
|
||
params := url.Values{}
|
||
params.Add("login_name", c.username)
|
||
params.Add("login_password", c.password)
|
||
params.Add(csrfToken.Field, csrfToken.Value)
|
||
params.Add("login", "submit")
|
||
|
||
resp, err = utils.Request().SetLogger(c.logger.Sugar()).SetHeaders(map[string]string{
|
||
"Origin": constant.OnlineFixURL,
|
||
"Content-Type": "application/x-www-form-urlencoded",
|
||
"Referer": constant.OnlineFixURL,
|
||
}).SetCookies(cookies).SetBody(params.Encode()).Post(constant.OnlineFixURL)
|
||
if err != nil {
|
||
return nil, c.ErrRequest(constant.OnlineFixURL, err)
|
||
}
|
||
|
||
cookies = resp.Cookies()
|
||
cookiesJSON, _ := json.Marshal(cookies)
|
||
_ = cache.SetWithExpire("onlinefix_cookies", string(cookiesJSON), time.Hour)
|
||
|
||
return cookies, nil
|
||
}
|
||
|
||
// onlineFixParenRegexp matches any parenthesized annotation, e.g. "(v1.0)".
// Compiled once at package scope so OnlineFixFormatter does not recompile the
// pattern on every call (the original compiled it per invocation).
var onlineFixParenRegexp = regexp.MustCompile(`(?i)\(.*?\)`)

// OnlineFixFormatter normalizes a raw Online-Fix game title: it removes the
// Russian "по сети" ("over the network") marker and any parenthesized
// suffixes, then trims surrounding whitespace.
func OnlineFixFormatter(name string) string {
	name = strings.ReplaceAll(name, "по сети", "")
	return strings.TrimSpace(onlineFixParenRegexp.ReplaceAllString(name, ""))
}