Add test server and some top level Crawler tests #517

Draft: wants to merge 1 commit into master

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -36,14 +36,15 @@ keywords = [
dependencies = [
"apify-client>=1.11.0",
"apify-shared>=1.3.0",
"crawlee~=0.6.0",
"crawlee[parsel]~=0.6.0",
"cryptography>=42.0.0",
"httpx>=0.27.0",
# TODO: ensure compatibility with the latest version of lazy-object-proxy
# https://github.com/apify/apify-sdk-python/issues/460
"lazy-object-proxy<1.11.0",
"more_itertools>=10.2.0",
"typing-extensions>=4.1.0",
"uvicorn",
"websockets>=14.0",
]

2 changes: 1 addition & 1 deletion tests/integration/actor_source_base/Dockerfile
@@ -12,4 +12,4 @@ RUN echo "Python version:" \
&& echo "All installed Python packages:" \
&& pip freeze

CMD ["python3", "-m", "src"]
CMD ["sh", "-c", "python test_server.py & python -m src"]
98 changes: 98 additions & 0 deletions tests/integration/actor_source_base/test_server.py
@@ -0,0 +1,98 @@
"""
Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
For example:
http://localhost:8080/ contains links:
http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9

http://localhost:8080/1 contains links:
http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19

... and so on.
"""

import asyncio
import logging
from collections.abc import Awaitable, Callable, Coroutine
from typing import Any

from uvicorn import Config
from uvicorn.server import Server
from yarl import URL

Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]


async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
    """Send an HTML response to the client."""
    await send(
        {
            'type': 'http.response.start',
            'status': status,
            'headers': [[b'content-type', b'text/html; charset=utf-8']],
        }
    )
    await send({'type': 'http.response.body', 'body': html_content})


async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
    """Main ASGI application handler; it serves every path with a page of ten links to deeper paths.

    Args:
        scope: The ASGI connection scope.
        _: The ASGI receive function.
        send: The ASGI send function.
    """
    assert scope['type'] == 'http'
    path = scope['path']

    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
    await send_html_response(
        send,
        f"""\
<html><head>
<title>Title for {path} </title>
</head>
<body>
{links}
</body></html>""".encode(),
    )


class TestServer(Server):
    """A test HTTP server implementation based on Uvicorn Server."""

    @property
    def url(self) -> URL:
        """Get the base URL of the server.

        Returns:
            A URL instance with the server's base URL.
        """
        protocol = 'https' if self.config.is_ssl else 'http'
        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')

    async def serve(self) -> None:
        """Run the server."""
        self.restart_requested = asyncio.Event()

        loop = asyncio.get_event_loop()
        tasks = {
            loop.create_task(super().serve()),
        }
        await asyncio.wait(tasks)


if __name__ == '__main__':
    asyncio.run(
        TestServer(
            config=Config(
                app=app,
                lifespan='off',
                loop='asyncio',
                port=8080,
                log_config=None,
                log_level=logging.CRITICAL,
            )
        ).serve()
    )
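To see the page structure the crawler tests rely on, here is a small client-side sketch that fetches one page from the server above and lists its links. It assumes the server is already running locally on port 8080 and uses httpx plus parsel, both available through the project's dependencies (parsel via the crawlee[parsel] extra):

import httpx
from parsel import Selector

# Fetch http://localhost:8080/1 and extract its anchors; the page should link to /10 ... /19.
response = httpx.get('http://localhost:8080/1')
links = Selector(text=response.text).css('a::attr(href)').getall()
print(links)  # expected: ['/10', '/11', ..., '/19']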
4 changes: 2 additions & 2 deletions tests/integration/test_actor_api_helpers.py
@@ -400,12 +400,12 @@ async def main_server() -> None:
async with Actor:

class WebhookHandler(BaseHTTPRequestHandler):
-def do_GET(self) -> None:  # noqa: N802
+def do_GET(self) -> None:
self.send_response(200)
self.end_headers()
self.wfile.write(bytes('Hello, world!', encoding='utf-8'))

-def do_POST(self) -> None:  # noqa: N802
+def do_POST(self) -> None:
nonlocal webhook_body
content_length = self.headers.get('content-length')
length = int(content_length) if content_length else 0
75 changes: 75 additions & 0 deletions tests/integration/test_crawlers_with_storages.py
@@ -0,0 +1,75 @@
from tests.integration.conftest import MakeActorFunction, RunActorFunction


async def test_actor_on_platform_max_crawl_depth(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_crawl_depth."""

    async def main() -> None:
        """The crawler entry point."""
        import re

        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            crawler = ParselCrawler(max_crawl_depth=2)
            finished = []
            # Only follow links like /2, /22, /222, ... so the crawl is a single chain of pages.
            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links(include=[enqueue_pattern])
                await context.push_data({'Url': context.request.url})
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            # The start page is depth 0, /2 is depth 1 and /22 is depth 2; /222 would exceed
            # max_crawl_depth=2 and is therefore never crawled.
            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
            # assert some dataset

    actor = await make_actor(label='parsel-crawler', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'
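For reference, a standalone sketch of how the include pattern narrows the crawl to a single chain of pages; this uses plain stdlib re on example URLs and is only an illustration, not crawlee's internal matching logic:

import re

pattern = re.compile(r'http://localhost:8080/2+$')

# Of the ten links on http://localhost:8080/2 (/20 ... /29), only /22 matches the pattern,
# so each crawled page contributes exactly one new request.
links_on_page_2 = [f'http://localhost:8080/2{i}' for i in range(10)]
assert [url for url in links_on_page_2 if pattern.fullmatch(url)] == ['http://localhost:8080/22']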


async def test_actor_on_platform_max_requests_per_crawl(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_requests_per_crawl."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee import ConcurrencySettings
        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            crawler = ParselCrawler(
                max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1)
            )
            finished = []

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links()
                await context.push_data({'Url': context.request.url})
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            # With max_concurrency=1 the crawler processes requests one at a time, so exactly
            # max_requests_per_crawl=3 pages finish before the crawl stops.
            assert len(finished) == 3
            # assert some dataset

    actor = await make_actor(label='parsel-crawler', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'
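Both tests still carry a '# assert some dataset' placeholder. One way that gap might later be filled, sketched here under the assumption that the default dataset can be read back inside the same `async with Actor:` block via Actor.open_dataset() and Dataset.get_data():

# Possible follow-up for the '# assert some dataset' placeholders, placed after crawler.run():
dataset = await Actor.open_dataset()
items = (await dataset.get_data()).items
assert {item['Url'] for item in items} == set(finished)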