Merge pull request #20 from ThePhaseless/seleniumbase

Seleniumbase
2025-03-15 01:40:21 +08:00 · 2024-11-25 00:07:12 +01:00 · 2024-11-25 00:07:12 +01:00 · f141fc3952
commit f141fc3952
parent 462e73ac8a c4cb6e0ac3
16 changed files with 1093 additions and 560 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@ -1,27 +1,5 @@
 FROM python:3.12

-# Inspired by https://github.com/Hudrolax/uc-docker-alpine/
+# apt-get has a stable CLI for scripted use; the package lists are dropped in the same layer to keep it small.
+RUN apt-get update && apt-get upgrade -y && apt-get install -y chromium xvfb && rm -rf /var/lib/apt/lists/*

-# Install build dependencies
-RUN apt update && apt upgrade -y && apt install -y\
-    curl \
-    wget \
-    unzip \
-    gnupg \
-    bash \
-    stow
-
-# Install dependencies
-RUN apt install -y \
-    xvfb \
-    x11vnc \
-    fluxbox \
-    xterm \
-    git \
-    ca-certificates \
-    pipx \
-    chromium
-
-RUN pipx install poetry
-ENV DISPLAY=:0
-# RUN poetry config virtualenvs.in-project true
+RUN curl -sSL https://install.python-poetry.org | python3 -
+# The installer places the `poetry` launcher under /root/.local/bin, which is not on PATH by default.
+ENV PATH="/root/.local/bin:$PATH"
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -1,17 +1,10 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 // README at: https://github.com/devcontainers/templates/tree/main/src/python
 {
-  "name": "Python 3",
  // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
  "build": {
    "dockerfile": "Dockerfile"
  },
-  "runArgs": [
-    "-p",
-    "8181:8191",
-    "--cap-add",
-    "SYS_ADMIN"
-  ],
  "customizations": {
    "vscode": {
      "extensions": [
@ -27,13 +20,18 @@
  // Features to add to the dev container. More info: https://containers.dev/features.
  // "features": {},
  // Use 'forwardPorts' to make a list of ports inside the container available locally.
-  "forwardPorts": [
-    5900
-  ]
  // Use 'postCreateCommand' to run commands after the container is created.
  // "postCreateCommand": "pip3 install --user -r requirements.txt",
  // Configure tool-specific properties.
  // "customizations": {},
  // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
  // "remoteUser": "root"
+  "name": "Byparr Dev Container",
+  "runArgs": [
+    "-p",
+    "8181:8191"
+  ],
+  "features": {
+    "ghcr.io/devcontainers-extra/features/act:1": {}
+  }
 }
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@ -22,7 +22,43 @@ env:
  IMAGE_NAME: ${{ github.repository }}

 jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Set up Poetry
+        run: pip install poetry
+
+      - name: Setup a local virtual environment (if no poetry.toml file)
+        run: |
+          poetry config virtualenvs.create true --local
+          poetry config virtualenvs.in-project true --local
+
+      - uses: actions/cache@v4
+        name: Define a cache for the virtual environment based on the dependencies lock file
+        with:
+          path: ./.venv
+          key: venv-${{ hashFiles('poetry.lock') }}
+
+      - name: Install dependencies
+        run: |
+          poetry install
+          sudo apt update
+          sudo apt install -y xvfb scrot python3-tk
+          wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
+          sudo apt install -y ./google-chrome-stable_current_amd64.deb
+          rm ./google-chrome-stable_current_amd64.deb
+
+      - name: Run tests
+        run: poetry run pytest
+
  build:
+    needs: test
    runs-on: ubuntu-latest
    permissions:
      contents: read
@ -31,14 +67,6 @@ jobs:
      # with sigstore/fulcio when running outside of PRs.
      id-token: write

-    strategy:
-      fail-fast: false
-      matrix:
-        platform:
-          - linux/amd64
-          - linux/arm64
-          - linux/arm64/v8
-
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
@ -92,7 +120,7 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          platforms: ${{ matrix.platform }}
+          platforms: linux/amd64,linux/arm64
          cache-from: type=gha
          cache-to: type=gha,mode=max
          build-args: GITHUB_BUILD=true
--- a/.gitignore
+++ b/.gitignore
@ -162,4 +162,10 @@ cython_debug/
 #.idea/

 .extentions/
-core
+core
+
+# Screenshots
+*.png
+
+# Downloaded files
+downloaded_files/
--- a/Dockerfile
+++ b/Dockerfile
@ -1,45 +1,11 @@
-FROM python:3.12-alpine
+FROM python:3.12-slim

 # Inspired by https://github.com/Hudrolax/uc-docker-alpine/

 ARG GITHUB_BUILD=false
 ENV GITHUB_BUILD=${GITHUB_BUILD}

-# Install build dependencies
-RUN apk update && apk upgrade && \
-    apk add --no-cache --virtual .build-deps \
-    alpine-sdk \
-    curl \
-    wget \
-    unzip \
-    gnupg
-
-# Install dependencies
-RUN apk add --no-cache \
-    xvfb \
-    x11vnc \
-    fluxbox \
-    xterm \
-    libffi-dev \
-    openssl-dev \
-    zlib-dev \
-    bzip2-dev \
-    readline-dev \
-    git \
-    nss \
-    freetype \
-    freetype-dev \
-    harfbuzz \
-    ca-certificates \
-    ttf-freefont \
-    pipx \
-    chromium \
-    chromium-chromedriver
-
-WORKDIR /app
-EXPOSE 8191
-
-# python
+ENV HOME=/root
 ENV \
    DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
@ -50,13 +16,15 @@ ENV \
    POETRY_VIRTUALENVS_IN_PROJECT=true \
    DISPLAY=:0

-RUN pipx install poetry
-ENV PATH="/root/.local/bin:$PATH"
+WORKDIR /app
+EXPOSE 8191
+# Install and clean up in one layer so the apt package lists do not end up in the image.
+RUN apt-get update &&\
+    apt-get install -y xvfb scrot python3-tk curl chromium &&\
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -sSL https://install.python-poetry.org | python3 -
+ENV PATH="${HOME}/.local/bin:$PATH"
 COPY pyproject.toml poetry.lock ./
 RUN poetry install

-COPY fix_nodriver.py ./
-RUN . /app/.venv/bin/activate && python fix_nodriver.py
 COPY . .
-RUN ./run_vnc.sh && . /app/.venv/bin/activate && poetry run pytest
-CMD ["./entrypoint.sh"]
+# Exec form does no shell processing, so the venv activation has to run through an explicit shell;
+# `exec` makes python3 replace that shell so it receives container stop signals directly.
+CMD ["/bin/sh", "-c", ". .venv/bin/activate && exec python3 main.py"]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -1,6 +0,0 @@
-#!/bin/sh
-
-./run_vnc.sh
-
-# Activate virtual environment
-. .venv/bin/activate && python3 main.py
--- a/main.py
+++ b/main.py
@ -1,17 +1,19 @@
 from __future__ import annotations

-import asyncio
 import logging
 import time

-import uvicorn
 import uvicorn.config
+from bs4 import BeautifulSoup
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import RedirectResponse
+from sbase import SB, BaseCase

-from src.models.requests import LinkRequest, LinkResponse
+import src
+import src.utils
+import src.utils.consts
+from src.models.requests import LinkRequest, LinkResponse, Solution
 from src.utils import logger
-from src.utils.browser import bypass_cloudflare, new_browser
 from src.utils.consts import LOG_LEVEL

 app = FastAPI(debug=LOG_LEVEL == logging.DEBUG, log_level=LOG_LEVEL)
@ -28,50 +30,53 @@ def read_root():
 async def health_check():
    """Health check endpoint."""
    logger.info("Health check")
-    browser = await new_browser()
-    await browser.grant_all_permissions()
-    page = await browser.get("https://google.com")
-    await page.bring_to_front()
-    browser.stop()
+    # browser: Chrome = await new_browser()
+    # browser.get("https://google.com")
+    # browser.stop()
    return {"status": "ok"}


@app.post("/v1")
-async def read_item(request: LinkRequest):
+def read_item(request: LinkRequest):
    """Handle POST requests."""
+    start_time = int(time.time() * 1000)
    # request.url = "https://nowsecure.nl"
    logger.info(f"Request: {request}")
-    start_time = int(time.time() * 1000)
-    browser = await new_browser()
-    await browser.grant_all_permissions()
-    await asyncio.sleep(1)
-    page = await browser.get(request.url)
-    await page.bring_to_front()
-    timeout = request.maxTimeout
-    if timeout == 0:
-        timeout = None
-    try:
-        challenged = await asyncio.wait_for(bypass_cloudflare(page), timeout=timeout)
-    except asyncio.TimeoutError as e:
-        logger.info("Timed out bypassing Cloudflare")
-        browser.stop()
-        raise HTTPException(
-            detail="Timed out bypassing Cloudflare", status_code=408
-        ) from e
-    except Exception as e:
-        logger.error(e)
-        browser.stop()
-        raise HTTPException(detail="Couldn't bypass", status_code=500) from e
+    response: LinkResponse

-    logger.info(f"Got webpage: {request.url}")
+    # start_time = int(time.time() * 1000)
+    with SB(uc=True, locale_code="en", test=False, xvfb=True, ad_block=True) as sb:
+        sb: BaseCase
+        sb.uc_open_with_reconnect(request.url)
+        sb.uc_gui_click_captcha()
+        logger.info(f"Got webpage: {request.url}")
+        sb.save_screenshot("screenshot.png")
+        logger.info(f"Got webpage: {request.url}")

-    response = await LinkResponse.create(
-        page=page,
-        start_timestamp=start_time,
-        challenged=challenged,
-    )
+        source = sb.get_page_source()
+        source_bs = BeautifulSoup(source, "html.parser")
+        title_tag = source_bs.title
+        if title_tag is None:
+            raise HTTPException(status_code=500, detail="Title tag not found")
+
+        if title_tag.string in src.utils.consts.CHALLENGE_TITLES:
+            raise HTTPException(status_code=500, detail="Could not bypass challenge")
+
+        title = title_tag.string
+        logger.info(f"Title: {title}")
+        response = LinkResponse(
+            message="Success",
+            solution=Solution(
+                userAgent=sb.get_user_agent(),
+                url=sb.get_current_url(),
+                status=200,
+                cookies=sb.get_cookies(),
+                headers={},
+                response=source,
+            ),
+            startTimestamp=start_time,
+        )

-    browser.stop()
    return response


--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -10,10 +10,13 @@ readme = "README.md"
 python = "^3.12"
 pytest = "^8"
 fastapi = { extras = ["standard"], version = "^0" }
-nodriver = "^0"
 requests = "^2"
-httpx = "^0"
+httpx = "^0.27"
 pytest-asyncio = "^0"
+ruff = "^0.8.0"
+seleniumbase = "^4.32.12"
+pyautogui = "^0.9.54"
+beautifulsoup4 = "^4.12.3"


 [build-system]
@ -35,9 +38,9 @@ ignore = [
    "ERA001",
    "COM812",
    "ISC001",
-    "TCH003",
-    "TCH002",
-    "TCH001",
+    "TC003",
+    "TC002",
+    "TC001",
    "TD002",
    "E501",
    "D101",
--- a/pytest.sh
+++ b/pytest.sh
@ -1,3 +0,0 @@
-if [ $(arch) = "x86_64" ]; then
-    ./entrypoint.sh && . ./.venv/bin/activate && poetry run pytest
-fi
--- a/run_vnc.sh
+++ b/run_vnc.sh
@ -1,16 +0,0 @@
-#!/bin/sh
-
-export DISPLAY=:0
-rm -f /tmp/.X0-lock
-
-# Run Xvfb on dispaly 0.
-Xvfb :0 -screen 0 1280x720x16 &
-
-# Run fluxbox windows manager on display 0.
-fluxbox -display :0 &
-
-# Run x11vnc on display 0
-x11vnc -display :0 -forever -ncache 10 &
-
-# Add delay
-sleep 5
--- a/src/models/requests.py
+++ b/src/models/requests.py
@ -1,10 +1,9 @@
 from __future__ import annotations

-import re
 import time
+from http import HTTPStatus
 from typing import Any

-from nodriver import Tab
 from pydantic import BaseModel


@ -26,6 +25,17 @@ class Solution(BaseModel):
    headers: dict[str, Any]
    response: str

+    @classmethod
+    def empty(cls):
+        return cls(
+            url="",
+            status=HTTPStatus.INTERNAL_SERVER_ERROR,
+            cookies=[],
+            userAgent="",
+            headers={},
+            response="",
+        )
+

 class LinkResponse(BaseModel):
    status: str = "ok"
@ -36,46 +46,16 @@ class LinkResponse(BaseModel):
    version: str = "3.3.21"  # TODO: Implement versioning

    @classmethod
-    async def create(
-        cls,
-        page: Tab,
-        start_timestamp: int,
-        *,
-        challenged: bool = False,
-    ):
-        message = "Passed challenge" if challenged else "Challenge not detected"
-
-        user_agent = await cls.get_useragent(page)
-
-        # cookies = await page.browser.cookies.get_all(requests_cookie_format=True)
-        # # Convert cookies to json
-        # cookies = [cookie.to_json() for cookie in cookies]
-
-        cookies = await page.browser.cookies.get_all()
-        solution = Solution(
-            url=page.url,
-            status=200,
-            cookies=cookies if cookies else [],
-            userAgent=user_agent,
-            headers={},
-            response=await page.get_content(),
-        )
-
+    def invalid(cls):
        return cls(
-            message=message,
-            solution=solution,
-            startTimestamp=start_timestamp,
+            status="error",
+            message="Invalid request",
+            solution=Solution.empty(),
+            startTimestamp=int(time.time() * 1000),
+            endTimestamp=int(time.time() * 1000),
+            version="3.3.21",
        )

-    @classmethod
-    async def get_useragent(cls, page):
-        user_agent = await page.js_dumps("navigator")
-        if not isinstance(user_agent, dict):
-            raise ProtectionTriggeredError("User agent is not a dictionary")
-        user_agent = user_agent["userAgent"]
-        re.sub(pattern="HEADLESS", repl="", string=user_agent, flags=re.IGNORECASE)
-        return user_agent
-

 class NoChromeExtensionError(Exception):
    """No chrome extention found."""
--- a/src/utils/browser.py
+++ b/src/utils/browser.py
@ -1,131 +0,0 @@
-import asyncio
-
-import nodriver as webdriver
-from nodriver.core.element import Element
-
-from src.utils import logger
-from src.utils.consts import CHALLENGE_TITLES, UBLOCK_TITLE
-from src.utils.extensions import download_extensions
-
-downloaded_extensions = download_extensions()
-
-
-async def new_browser():
-    """
-    Create a new browser instance with the specified configuration.
-
-    Returns
-    -------
-        A coroutine that resolves to the newly created browser instance.
-
-    Raises
-    ------
-        Any exceptions that may occur during the creation of the browser instance.
-
-    """
-    config: webdriver.Config = webdriver.Config(
-        browser_executable_path="/usr/bin/chromium", sandbox=True
-    )
-    config.add_argument(f"--load-extension={','.join(downloaded_extensions)}")
-
-    return await webdriver.start(config=config)
-
-
-async def bypass_cloudflare(page: webdriver.Tab):
-    """
-    Asynchronously bypasses Cloudflare challenges on the given web page.
-
-    Args:
-    ----
-        page (webdriver.Tab): The web page to bypass Cloudflare challenges on.
-
-    Returns:
-    -------
-        bool: True if the page was successfully bypassed, False otherwise.
-
-    Raises:
-    ------
-        Exception: If the element containing the Cloudflare challenge could not be found.
-
-    Notes:
-    -----
-        This function repeatedly checks the title of the page until it is not in the
-        list of known Cloudflare challenge titles. Once a challenge is found, it attempts
-        to locate the element containing the challenge and click it. If the element cannot
-        be found within a certain time limit, the function will retry. If the element is
-        found, it will be clicked. If the element cannot be found at all, an exception will
-        be raised.
-
-    """
-    challenged = False
-    await page
-    while True:
-        logger.debug(f"Current page: {page.target.title}")
-
-        if page.target.title not in CHALLENGE_TITLES:
-            if page.target.title == UBLOCK_TITLE:
-                continue
-            return challenged
-
-        if not challenged:
-            logger.info("Found challenge")
-            challenged = True
-
-        if (
-            page.target.title != "Just a moment..."
-        ):  # If not in cloudflare, wait for autobypass
-            await asyncio.sleep(3)
-            logger.debug("Waiting for challenge to complete")
-            continue
-
-        loaded = False
-        try:
-            elem = await page.find("lds-ring", timeout=3)
-        except asyncio.TimeoutError:
-            logger.error(
-                "Couldn't find lds-ring, probably not a cloudflare challenge, trying again..."
-            )
-            continue
-        if elem is None:
-            logger.error("elem is None")
-            logger.debug(elem)
-            raise InvalidElementError
-
-        parent = elem.parent
-        if not isinstance(parent, Element) or parent.attributes is None:
-            logger.error("parent is not an element or has no attributes")
-            logger.debug(parent)
-            raise InvalidElementError
-
-        for attr in parent.attributes:
-            if attr == "display: none; visibility: hidden;" and not loaded:
-                loaded = True
-                logger.info("Page loaded")
-
-        if not loaded:
-            logger.debug("Challenge still loading")
-            continue
-
-        elem = await page.find("input")
-        elem = elem.parent
-        # Get the element containing the shadow root
-        if isinstance(elem, Element) and elem.shadow_roots:
-            logger.info("Found shadow root")
-            inner_elem = Element(elem.shadow_roots[0], page, elem.tree).children[0]
-            if isinstance(inner_elem, Element):
-                logger.info("Found elem inside shadow root")
-                logger.debug("Clicking element")
-                await inner_elem.mouse_click()
-                await asyncio.sleep(3)
-                continue
-            logger.warning(
-                "Couldn't find element containing shadow root, trying again..."
-            )
-            logger.debug(inner_elem)
-        else:
-            logger.warning("Couldn't find checkbox, trying again...")
-            logger.debug(elem)
-
-
-class InvalidElementError(Exception):
-    pass
--- a/src/utils/consts.py
+++ b/src/utils/consts.py
@ -1,31 +1,12 @@
 import logging
 import os
-from pathlib import Path

 LOG_LEVEL = os.getenv("LOG_LEVEL") or "INFO"
 LOG_LEVEL = logging.getLevelNamesMapping()[LOG_LEVEL.upper()]

-UBLOCK_TITLE = "uBO Lite — Dashboard"
-
 CHALLENGE_TITLES = [
    # Cloudflare
    "Just a moment...",
    # DDoS-GUARD
    "DDoS-Guard",
 ]
-
-GITHUB_WEBSITES = [
-    "https://github.com/",
-    "https://www.github.com/",
-    "github.com",
-    "www.github.com",
-]
-
-EXTENSION_REPOSITIORIES = [
-    "OhMyGuus/I-Still-Dont-Care-About-Cookies",
-    "uBlockOrigin/uBOL-home",
-]
-
-SLEEP_SECONDS = 1
-
-EXTENSIONS_PATH = Path(".extentions")
--- a/src/utils/extensions.py
+++ b/src/utils/extensions.py
@ -1,94 +0,0 @@
-from __future__ import annotations
-
-import io
-import json
-from pathlib import Path
-from zipfile import ZipFile
-
-import httpx
-import requests
-
-from src.models.github import GithubResponse
-from src.models.requests import NoChromeExtensionError
-from src.utils import logger
-from src.utils.consts import EXTENSION_REPOSITIORIES, EXTENSIONS_PATH, GITHUB_WEBSITES
-
-
-def get_latest_github_chrome_release(url: str):
-    """
-    Get the latest release for chrome from GitHub for a given repository URL.
-
-    Args:
-    ----
-        url (str): The URL of the GitHub repository.
-
-    Returns:
-    -------
-        GithubResponse: The latest release asset with 'chrom' in its name.
-
-    Raises:
-    ------
-        httpx.NetworkError: If the request to GitHub API returns a 403 Forbidden status code.
-        NoChromeExtensionError: If no release asset with 'chrom' in its name is found.
-
-    """
-    if url.startswith(tuple(GITHUB_WEBSITES)):
-        url = "/".join(url.split("/")[-2:])
-    url = "https://api.github.com/repos/" + url + "/releases/latest"
-
-    response = httpx.get(url)
-    if response.status_code == httpx.codes.FORBIDDEN:
-        error = json.loads(response.text)["message"]
-        logger.error(error)
-        raise httpx.NetworkError(error)
-    response = GithubResponse(**response.json())
-
-    for asset in response.assets:
-        if "chrom" in asset.name:
-            return asset
-
-    raise NoChromeExtensionError
-
-
-def download_extensions():
-    """
-    Download extensions from the specified repositories and saves them locally.
-
-    Returns
-    -------
-        list[str]: A list of paths to the downloaded extensions.
-
-    Raises
-    ------
-        httpx.NetworkError: If there is an error downloading an extension.
-
-    """
-    downloaded_extensions: list[str] = []
-    for repository in EXTENSION_REPOSITIORIES:
-        extension_name = repository.split("/")[-1]
-        path = Path(f"{EXTENSIONS_PATH}/{extension_name}")
-        try:
-            extension = get_latest_github_chrome_release(repository)
-            logger.info(
-                f"Downloading {extension_name} from {extension.browser_download_url}"
-            )
-        except httpx.NetworkError:
-            if path.is_dir():
-                logger.error(f"Error downloading {extension_name}, using local copy")
-                downloaded_extensions.append(path.as_posix())
-                continue
-        try:
-            zip_file = requests.get(extension.browser_download_url, timeout=10)
-        except UnboundLocalError as e:
-            logger.error(f"Error downloading {extension_name}, skipping")
-            logger.error(e)
-            continue
-        Path(EXTENSIONS_PATH).mkdir(exist_ok=True)
-        with ZipFile(io.BytesIO(zip_file.content)) as zip_obj:
-            if not path.joinpath(extension_name).exists():
-                zip_obj.extractall(f"{EXTENSIONS_PATH}/{extension_name}")
-                logger.debug(f"Extracted {extension_name} to {path}")
-
-        logger.info(f"Successfully downloaded {extension_name} to {path}")
-        downloaded_extensions.append(path.as_posix())
-    return downloaded_extensions
--- a/tests/main_test.py
+++ b/tests/main_test.py
@ -1,7 +1,4 @@
-import os
-import platform
 from http import HTTPStatus
-from time import sleep

 import httpx
 import pytest
@ -25,16 +22,12 @@ github_restricted = [
    "https://speed.cd/login",
 ]

-if os.getenv("GITHUB_ACTIONS") == "true":
-    test_websites.extend(github_restricted)
+# if os.getenv("GITHUB_ACTIONS") != "true":
+test_websites.extend(github_restricted)


@pytest.mark.parametrize("website", test_websites)
 def test_bypass(website: str):
-    if (platform.machine() == "arm64") and os.getenv("GITHUB_ACTIONS") == "true":
-        pytest.skip("Skipping on arm64 due to lack of support")
-
-    sleep(3)
    test_request = httpx.get(
        website,
    )