Merge pull request #20 from ThePhaseless/seleniumbase

Seleniumbase
This commit is contained in:
Jakub Orchowski 2024-11-25 00:07:12 +01:00 committed by GitHub
commit f141fc3952
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 1093 additions and 560 deletions

View File

@ -1,27 +1,5 @@
FROM python:3.12
# Inspired by https://github.com/Hudrolax/uc-docker-alpine/
RUN apt update && apt upgrade -y && apt install -y chromium xvfb
# Install build dependencies
RUN apt update && apt upgrade -y && apt install -y\
curl \
wget \
unzip \
gnupg \
bash \
stow
# Install dependencies
RUN apt install -y \
xvfb \
x11vnc \
fluxbox \
xterm \
git \
ca-certificates \
pipx \
chromium
RUN pipx install poetry
ENV DISPLAY=:0
# RUN poetry config virtualenvs.in-project true
RUN curl -sSL https://install.python-poetry.org | python3 -

View File

@ -1,17 +1,10 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"-p",
"8181:8191",
"--cap-add",
"SYS_ADMIN"
],
"customizations": {
"vscode": {
"extensions": [
@ -27,13 +20,18 @@
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
"forwardPorts": [
5900
]
// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "pip3 install --user -r requirements.txt",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
"name": "Byparr Dev Container",
"runArgs": [
"-p",
"8181:8191"
],
"features": {
"ghcr.io/devcontainers-extra/features/act:1": {}
}
}

View File

@ -22,7 +22,43 @@ env:
IMAGE_NAME: ${{ github.repository }}
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Set up Poetry
run: pip install poetry
- name: Setup a local virtual environment (if no poetry.toml file)
run: |
poetry config virtualenvs.create true --local
poetry config virtualenvs.in-project true --local
- uses: actions/cache@v4
name: Define a cache for the virtual environment based on the dependencies lock file
with:
path: ./.venv
key: venv-${{ hashFiles('poetry.lock') }}
- name: Install dependencies
run: |
poetry install
sudo apt update
sudo apt install -y xvfb scrot python3-tk
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt install -y ./google-chrome-stable_current_amd64.deb
rm ./google-chrome-stable_current_amd64.deb
- name: Run tests
run: poetry run pytest
build:
needs: test
runs-on: ubuntu-latest
permissions:
contents: read
@ -31,14 +67,6 @@ jobs:
# with sigstore/fulcio when running outside of PRs.
id-token: write
strategy:
fail-fast: false
matrix:
platform:
- linux/amd64
- linux/arm64
- linux/arm64/v8
steps:
- name: Checkout repository
uses: actions/checkout@v4
@ -92,7 +120,7 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
platforms: ${{ matrix.platform }}
platforms: linux/amd64,linux/arm64
cache-from: type=gha
cache-to: type=gha,mode=max
build-args: GITHUB_BUILD=true

8
.gitignore vendored
View File

@ -162,4 +162,10 @@ cython_debug/
#.idea/
.extentions/
core
core
# Screenshots
*.png
# Downloaded files
downloaded_files/

View File

@ -1,45 +1,11 @@
FROM python:3.12-alpine
FROM python:3.12-slim
# Inspired by https://github.com/Hudrolax/uc-docker-alpine/
ARG GITHUB_BUILD=false
ENV GITHUB_BUILD=${GITHUB_BUILD}
# Install build dependencies
RUN apk update && apk upgrade && \
apk add --no-cache --virtual .build-deps \
alpine-sdk \
curl \
wget \
unzip \
gnupg
# Install dependencies
RUN apk add --no-cache \
xvfb \
x11vnc \
fluxbox \
xterm \
libffi-dev \
openssl-dev \
zlib-dev \
bzip2-dev \
readline-dev \
git \
nss \
freetype \
freetype-dev \
harfbuzz \
ca-certificates \
ttf-freefont \
pipx \
chromium \
chromium-chromedriver
WORKDIR /app
EXPOSE 8191
# python
ENV HOME=/root
ENV \
DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 \
@ -50,13 +16,15 @@ ENV \
POETRY_VIRTUALENVS_IN_PROJECT=true \
DISPLAY=:0
RUN pipx install poetry
ENV PATH="/root/.local/bin:$PATH"
WORKDIR /app
EXPOSE 8191
# Install the OS packages the application needs at runtime.
# apt-get is used instead of apt because apt's CLI is not meant for scripts,
# and the package lists are deleted in the same layer so they do not end up
# in the final image.
RUN apt-get update && \
    apt-get install -y xvfb scrot python3-tk curl chromium && \
    rm -rf /var/lib/apt/lists/*
RUN curl -sSL https://install.python-poetry.org | python3 -
ENV PATH="${HOME}/.local/bin:$PATH"
COPY pyproject.toml poetry.lock ./
RUN poetry install
COPY fix_nodriver.py ./
RUN . /app/.venv/bin/activate && python fix_nodriver.py
COPY . .
RUN ./run_vnc.sh && . /app/.venv/bin/activate && poetry run pytest
CMD ["./entrypoint.sh"]
# Exec-form CMD does no shell processing: a single array element is treated as
# the name of the executable. The activate-and-run command line therefore has
# to be handed to a shell explicitly; `exec` makes Python replace that shell so
# it receives stop signals directly.
CMD ["/bin/sh", "-c", ". .venv/bin/activate && exec python3 main.py"]

View File

@ -1,6 +0,0 @@
#!/bin/sh
# Container entrypoint: run the display helper script, then start the app.
./run_vnc.sh
# Activate the virtual environment, then replace this shell with the Python
# process (exec) so the application receives signals directly instead of
# running as a child of the wrapper shell.
. .venv/bin/activate && exec python3 main.py

81
main.py
View File

@ -1,17 +1,19 @@
from __future__ import annotations
import asyncio
import logging
import time
import uvicorn
import uvicorn.config
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import RedirectResponse
from sbase import SB, BaseCase
from src.models.requests import LinkRequest, LinkResponse
import src
import src.utils
import src.utils.consts
from src.models.requests import LinkRequest, LinkResponse, Solution
from src.utils import logger
from src.utils.browser import bypass_cloudflare, new_browser
from src.utils.consts import LOG_LEVEL
app = FastAPI(debug=LOG_LEVEL == logging.DEBUG, log_level=LOG_LEVEL)
@ -28,50 +30,53 @@ def read_root():
async def health_check():
"""Health check endpoint."""
logger.info("Health check")
browser = await new_browser()
await browser.grant_all_permissions()
page = await browser.get("https://google.com")
await page.bring_to_front()
browser.stop()
# browser: Chrome = await new_browser()
# browser.get("https://google.com")
# browser.stop()
return {"status": "ok"}
@app.post("/v1")
async def read_item(request: LinkRequest):
def read_item(request: LinkRequest):
"""Handle POST requests."""
start_time = int(time.time() * 1000)
# request.url = "https://nowsecure.nl"
logger.info(f"Request: {request}")
start_time = int(time.time() * 1000)
browser = await new_browser()
await browser.grant_all_permissions()
await asyncio.sleep(1)
page = await browser.get(request.url)
await page.bring_to_front()
timeout = request.maxTimeout
if timeout == 0:
timeout = None
try:
challenged = await asyncio.wait_for(bypass_cloudflare(page), timeout=timeout)
except asyncio.TimeoutError as e:
logger.info("Timed out bypassing Cloudflare")
browser.stop()
raise HTTPException(
detail="Timed out bypassing Cloudflare", status_code=408
) from e
except Exception as e:
logger.error(e)
browser.stop()
raise HTTPException(detail="Couldn't bypass", status_code=500) from e
response: LinkResponse
logger.info(f"Got webpage: {request.url}")
# start_time = int(time.time() * 1000)
with SB(uc=True, locale_code="en", test=False, xvfb=True, ad_block=True) as sb:
sb: BaseCase
sb.uc_open_with_reconnect(request.url)
sb.uc_gui_click_captcha()
logger.info(f"Got webpage: {request.url}")
sb.save_screenshot("screenshot.png")
logger.info(f"Got webpage: {request.url}")
response = await LinkResponse.create(
page=page,
start_timestamp=start_time,
challenged=challenged,
)
source = sb.get_page_source()
source_bs = BeautifulSoup(source, "html.parser")
title_tag = source_bs.title
if title_tag is None:
raise HTTPException(status_code=500, detail="Title tag not found")
if title_tag.string in src.utils.consts.CHALLENGE_TITLES:
raise HTTPException(status_code=500, detail="Could not bypass challenge")
title = title_tag.string
logger.info(f"Title: {title}")
response = LinkResponse(
message="Success",
solution=Solution(
userAgent=sb.get_user_agent(),
url=sb.get_current_url(),
status=200,
cookies=sb.get_cookies(),
headers={},
response=source,
),
startTimestamp=start_time,
)
browser.stop()
return response

1071
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -10,10 +10,13 @@ readme = "README.md"
python = "^3.12"
pytest = "^8"
fastapi = { extras = ["standard"], version = "^0" }
nodriver = "^0"
requests = "^2"
httpx = "^0"
httpx = "^0.27"
pytest-asyncio = "^0"
ruff = "^0.8.0"
seleniumbase = "^4.32.12"
pyautogui = "^0.9.54"
beautifulsoup4 = "^4.12.3"
[build-system]
@ -35,9 +38,9 @@ ignore = [
"ERA001",
"COM812",
"ISC001",
"TCH003",
"TCH002",
"TCH001",
"TC003",
"TC002",
"TC001",
"TD002",
"E501",
"D101",

View File

@ -1,3 +0,0 @@
# Run the entrypoint and the test suite only when the machine reports x86_64.
# The command substitution is quoted so that an empty or multi-word result
# cannot break the `[` test expression.
if [ "$(arch)" = "x86_64" ]; then
    ./entrypoint.sh && . ./.venv/bin/activate && poetry run pytest
fi

View File

@ -1,16 +0,0 @@
#!/bin/sh
# Start a virtual X display with a window manager and a VNC server on it.
# Every service is launched in the background, so this script returns after
# the final delay while the services keep running.

# Point X clients started from this shell at display 0.
export DISPLAY=:0
# Remove the lock file for display 0 (presumably left behind by an earlier
# Xvfb run — TODO confirm) so the server below can claim the display.
rm -f /tmp/.X0-lock
# Run Xvfb on display 0 with a single 1280x720 screen at 16-bit depth.
Xvfb :0 -screen 0 1280x720x16 &
# Run the fluxbox window manager on display 0.
fluxbox -display :0 &
# Run x11vnc on display 0 and keep it serving after clients disconnect.
x11vnc -display :0 -forever -ncache 10 &
# Wait before returning; presumably gives the background services time to start.
sleep 5

View File

@ -1,10 +1,9 @@
from __future__ import annotations
import re
import time
from http import HTTPStatus
from typing import Any
from nodriver import Tab
from pydantic import BaseModel
@ -26,6 +25,17 @@ class Solution(BaseModel):
headers: dict[str, Any]
response: str
@classmethod
def empty(cls):
    """Build a placeholder instance with blank fields and a 500 status.

    Returns
    -------
    An instance of ``cls`` whose string fields are empty, whose ``cookies``
    and ``headers`` are empty containers, and whose ``status`` is
    ``HTTPStatus.INTERNAL_SERVER_ERROR``.

    """
    blank_fields = {
        "url": "",
        "status": HTTPStatus.INTERNAL_SERVER_ERROR,
        "cookies": [],
        "userAgent": "",
        "headers": {},
        "response": "",
    }
    return cls(**blank_fields)
class LinkResponse(BaseModel):
status: str = "ok"
@ -36,46 +46,16 @@ class LinkResponse(BaseModel):
version: str = "3.3.21" # TODO: Implement versioning
@classmethod
async def create(
cls,
page: Tab,
start_timestamp: int,
*,
challenged: bool = False,
):
message = "Passed challenge" if challenged else "Challenge not detected"
user_agent = await cls.get_useragent(page)
# cookies = await page.browser.cookies.get_all(requests_cookie_format=True)
# # Convert cookies to json
# cookies = [cookie.to_json() for cookie in cookies]
cookies = await page.browser.cookies.get_all()
solution = Solution(
url=page.url,
status=200,
cookies=cookies if cookies else [],
userAgent=user_agent,
headers={},
response=await page.get_content(),
)
def invalid(cls):
return cls(
message=message,
solution=solution,
startTimestamp=start_timestamp,
status="error",
message="Invalid request",
solution=Solution.empty(),
startTimestamp=int(time.time() * 1000),
endTimestamp=int(time.time() * 1000),
version="3.3.21",
)
@classmethod
async def get_useragent(cls, page):
    """Return the page's user agent string with any "HEADLESS" marker removed.

    Args:
    ----
        page: Object exposing an awaitable ``js_dumps(name)``; it is asked to
            dump the JavaScript ``navigator`` object.

    Returns:
    -------
        str: The ``userAgent`` entry of the dumped object with every
        case-insensitive occurrence of "HEADLESS" stripped.

    Raises:
    ------
        ProtectionTriggeredError: If the dumped ``navigator`` is not a dictionary.

    """
    navigator = await page.js_dumps("navigator")
    if not isinstance(navigator, dict):
        raise ProtectionTriggeredError("User agent is not a dictionary")
    user_agent = navigator["userAgent"]
    # re.sub returns a new string instead of modifying its argument; the
    # previous code discarded that result and returned the unmodified value.
    return re.sub(pattern="HEADLESS", repl="", string=user_agent, flags=re.IGNORECASE)
class NoChromeExtensionError(Exception):
    """No Chrome extension found."""

View File

@ -1,131 +0,0 @@
import asyncio
import nodriver as webdriver
from nodriver.core.element import Element
from src.utils import logger
from src.utils.consts import CHALLENGE_TITLES, UBLOCK_TITLE
from src.utils.extensions import download_extensions
downloaded_extensions = download_extensions()
async def new_browser():
    """
    Start a browser that uses the system Chromium and the downloaded extensions.

    The configuration points at ``/usr/bin/chromium`` with the sandbox enabled
    and passes every path in the module-level ``downloaded_extensions`` list
    through a single ``--load-extension`` argument.

    Returns
    -------
    The browser instance produced by ``webdriver.start``.

    Raises
    ------
    Whatever ``webdriver.Config`` or ``webdriver.start`` raise; nothing is caught here.

    """
    browser_config: webdriver.Config = webdriver.Config(
        browser_executable_path="/usr/bin/chromium", sandbox=True
    )
    extension_paths = ",".join(downloaded_extensions)
    browser_config.add_argument(f"--load-extension={extension_paths}")
    browser = await webdriver.start(config=browser_config)
    return browser
async def bypass_cloudflare(page: webdriver.Tab):
    """
    Wait for, and try to click through, a challenge page shown in ``page``.

    Args:
    ----
        page (webdriver.Tab): The tab whose title and DOM are inspected.

    Returns:
    -------
        bool: True if a challenge title was seen at some point before the page
        moved on, False if no challenge was detected at all.

    Raises:
    ------
        InvalidElementError: If the ``lds-ring`` lookup yields ``None`` or its
            parent is not an ``Element`` with attributes.

    Notes:
    -----
        The loop runs until the tab title is no longer one of
        ``CHALLENGE_TITLES`` (and is not the uBlock dashboard). Challenge
        titles other than "Just a moment..." are only waited on. For
        "Just a moment..." the function waits until the ``lds-ring`` parent
        carries the hidden style, then clicks the first child of the shadow
        root found on the parent of the page's ``input`` element. No overall
        timeout is applied here; callers are expected to impose one.

    """
    hidden_style = "display: none; visibility: hidden;"
    challenged = False
    await page

    while True:
        logger.debug(f"Current page: {page.target.title}")

        if page.target.title not in CHALLENGE_TITLES:
            if page.target.title == UBLOCK_TITLE:
                # The original bare `continue` re-checked the title in a tight
                # loop without awaiting anything, which never hands control
                # back to the event loop. Sleep briefly so other tasks
                # (including the browser connection) can run.
                await asyncio.sleep(1)
                continue
            return challenged

        if not challenged:
            logger.info("Found challenge")
            challenged = True

        if (
            page.target.title != "Just a moment..."
        ):  # If not in cloudflare, wait for autobypass
            await asyncio.sleep(3)
            logger.debug("Waiting for challenge to complete")
            continue

        try:
            elem = await page.find("lds-ring", timeout=3)
        except asyncio.TimeoutError:
            logger.error(
                "Couldn't find lds-ring, probably not a cloudflare challenge, trying again..."
            )
            continue

        if elem is None:
            logger.error("elem is None")
            logger.debug(elem)
            raise InvalidElementError

        parent = elem.parent
        if not isinstance(parent, Element) or parent.attributes is None:
            logger.error("parent is not an element or has no attributes")
            logger.debug(parent)
            raise InvalidElementError

        # The challenge widget counts as loaded once the spinner's parent
        # carries the hidden style among its attributes.
        if not any(attr == hidden_style for attr in parent.attributes):
            logger.debug("Challenge still loading")
            continue
        logger.info("Page loaded")

        elem = await page.find("input")
        elem = elem.parent
        # Get the element containing the shadow root
        if isinstance(elem, Element) and elem.shadow_roots:
            logger.info("Found shadow root")
            inner_elem = Element(elem.shadow_roots[0], page, elem.tree).children[0]
            if isinstance(inner_elem, Element):
                logger.info("Found elem inside shadow root")
                logger.debug("Clicking element")
                await inner_elem.mouse_click()
                # Give the challenge time to react to the click before re-checking.
                await asyncio.sleep(3)
                continue
            logger.warning(
                "Couldn't find element containing shadow root, trying again..."
            )
            logger.debug(inner_elem)
        else:
            logger.warning("Couldn't find checkbox, trying again...")
            logger.debug(elem)
class InvalidElementError(Exception):
    """Raised when a page element needed for the challenge is missing or malformed."""

View File

@ -1,31 +1,12 @@
import logging
import os
from pathlib import Path
LOG_LEVEL = os.getenv("LOG_LEVEL") or "INFO"
LOG_LEVEL = logging.getLevelNamesMapping()[LOG_LEVEL.upper()]
UBLOCK_TITLE = "uBO Lite — Dashboard"
CHALLENGE_TITLES = [
# Cloudflare
"Just a moment...",
# DDoS-GUARD
"DDoS-Guard",
]
GITHUB_WEBSITES = [
"https://github.com/",
"https://www.github.com/",
"github.com",
"www.github.com",
]
EXTENSION_REPOSITIORIES = [
"OhMyGuus/I-Still-Dont-Care-About-Cookies",
"uBlockOrigin/uBOL-home",
]
SLEEP_SECONDS = 1
EXTENSIONS_PATH = Path(".extentions")

View File

@ -1,94 +0,0 @@
from __future__ import annotations
import io
import json
from pathlib import Path
from zipfile import ZipFile
import httpx
import requests
from src.models.github import GithubResponse
from src.models.requests import NoChromeExtensionError
from src.utils import logger
from src.utils.consts import EXTENSION_REPOSITIORIES, EXTENSIONS_PATH, GITHUB_WEBSITES
def get_latest_github_chrome_release(url: str):
    """
    Look up the Chrome asset of a repository's latest GitHub release.

    Args:
    ----
        url (str): Either ``owner/repo`` or a URL starting with one of
            ``GITHUB_WEBSITES``; in the latter case only the last two
            path segments are kept.

    Returns:
    -------
        The first asset of the latest release whose name contains 'chrom'.

    Raises:
    ------
        httpx.NetworkError: If the GitHub API answers with 403 Forbidden; the
            API's ``message`` field is logged and used as the error text.
        NoChromeExtensionError: If no asset name contains 'chrom'.

    """
    if url.startswith(tuple(GITHUB_WEBSITES)):
        owner_and_repo = url.split("/")[-2:]
        url = "/".join(owner_and_repo)
    api_url = "https://api.github.com/repos/" + url + "/releases/latest"

    api_response = httpx.get(api_url)
    if api_response.status_code == httpx.codes.FORBIDDEN:
        error = json.loads(api_response.text)["message"]
        logger.error(error)
        raise httpx.NetworkError(error)

    release = GithubResponse(**api_response.json())
    chrome_assets = [asset for asset in release.assets if "chrom" in asset.name]
    if not chrome_assets:
        raise NoChromeExtensionError
    return chrome_assets[0]
def download_extensions():
    """
    Download the extensions listed in ``EXTENSION_REPOSITIORIES`` and unpack them locally.

    For each repository the latest Chrome release asset is looked up and
    downloaded as a zip archive into ``EXTENSIONS_PATH/<repository name>``.
    If the lookup fails with ``httpx.NetworkError``, an existing local
    directory is reused when present; otherwise that repository is skipped.

    Returns
    -------
    list[str]: POSIX paths of the extensions that are available locally.

    Raises
    ------
    NoChromeExtensionError: Propagated from the release lookup; not caught here.
    Errors raised by ``requests.get`` or by reading the zip archive are
    not caught here either.

    """
    downloaded_extensions: list[str] = []
    for repository in EXTENSION_REPOSITIORIES:
        extension_name = repository.split("/")[-1]
        path = Path(f"{EXTENSIONS_PATH}/{extension_name}")

        try:
            extension = get_latest_github_chrome_release(repository)
        except httpx.NetworkError as e:
            # Decide explicitly what to do on a failed lookup. The previous
            # code fell through and relied on catching UnboundLocalError, and
            # on later iterations `extension` could still hold the asset of
            # the previous repository, which was then downloaded under the
            # wrong name.
            if path.is_dir():
                logger.error(f"Error downloading {extension_name}, using local copy")
                downloaded_extensions.append(path.as_posix())
            else:
                logger.error(f"Error downloading {extension_name}, skipping")
                logger.error(e)
            continue

        logger.info(
            f"Downloading {extension_name} from {extension.browser_download_url}"
        )
        zip_file = requests.get(extension.browser_download_url, timeout=10)

        Path(EXTENSIONS_PATH).mkdir(exist_ok=True)
        with ZipFile(io.BytesIO(zip_file.content)) as zip_obj:
            # Extract only when the nested <name>/<name> path is absent.
            if not path.joinpath(extension_name).exists():
                zip_obj.extractall(f"{EXTENSIONS_PATH}/{extension_name}")
                logger.debug(f"Extracted {extension_name} to {path}")

        logger.info(f"Successfully downloaded {extension_name} to {path}")
        downloaded_extensions.append(path.as_posix())

    return downloaded_extensions

View File

@ -1,7 +1,4 @@
import os
import platform
from http import HTTPStatus
from time import sleep
import httpx
import pytest
@ -25,16 +22,12 @@ github_restricted = [
"https://speed.cd/login",
]
if os.getenv("GITHUB_ACTIONS") == "true":
test_websites.extend(github_restricted)
# if os.getenv("GITHUB_ACTIONS") != "true":
test_websites.extend(github_restricted)
@pytest.mark.parametrize("website", test_websites)
def test_bypass(website: str):
if (platform.machine() == "arm64") and os.getenv("GITHUB_ACTIONS") == "true":
pytest.skip("Skipping on arm64 due to lack of support")
sleep(3)
test_request = httpx.get(
website,
)