[RFC] Propose a minimal specialization for extract #12
@@ -4,8 +4,10 @@
 import asyncio
 import time
+from base64 import b64decode
+from collections.abc import Mapping
 from functools import partial
-from typing import Optional, Iterator, List
+from typing import Awaitable, Iterator, List, Optional, Union

 import aiohttp
 from aiohttp import TCPConnector
@@ -16,9 +18,15 @@
 from ..apikey import get_apikey
 from ..constants import API_URL, API_TIMEOUT
 from ..stats import AggStats, ResponseStats
-from ..utils import user_agent
+from ..utils import _to_lower_camel_case, user_agent
+
+
+class _NotLoaded:
+    pass
+
+
+_NOT_LOADED = _NotLoaded()

 # 120 seconds is probably too long, but we are concerned about the case with
 # many concurrent requests and some processing logic running in the same reactor,
 # thus, saturating the CPU. This will make timeouts more likely.
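Aside: the _NotLoaded class and _NOT_LOADED instance added above form a sentinel. Because later code compares against it with identity (is), the sentinel can never be mistaken for a real cached value such as b"", 0, or None. A tiny standalone sketch of the pattern (illustrative only, not part of the PR):

class _NotLoaded:
    pass


_NOT_LOADED = _NotLoaded()


body = _NOT_LOADED               # nothing decoded yet
if body is _NOT_LOADED:          # identity check; cannot collide with b"", 0, or None
    body = b"expensive decoded value"
print(body)                      # b'expensive decoded value'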
@@ -43,6 +51,38 @@ def _post_func(session):
     return session.post
+class ExtractResult(Mapping):
+    """Result of a call to AsyncClient.extract.
+
+    It can be used as a dictionary to access the raw API response.
+
+    It also provides some helper properties for easier access to some of its
+    underlying data.
+    """
+
+    def __init__(self, api_response: dict):
+        self._api_response = api_response
+        self._http_response_body: Union[bytes, _NotLoaded] = _NOT_LOADED
+
+    def __getitem__(self, key):
+        return self._api_response[key]
+
+    def __iter__(self):
+        yield from self._api_response
+
+    def __len__(self):
+        return len(self._api_response)
+
+    @property
+    def http_response_body(self) -> Union[bytes, _NotLoaded]:
Inline review thread on this line:

- It seems there could be a big overlap with data structures defined in
  https://github.com/scrapinghub/web-poet/blob/master/web_poet/page_inputs/http.py

- +1. I'm thinking if … Later on, we'd need the functionalities of …

- I'll see you and raise you: scrapy-plugins/scrapy-zyte-api#10 may be in a similar situation.

- Do we want …

- I believe it's both. We'd want the ecosystem revolving around Zyte's extraction and
  crawling to be similar in interface (i.e. …). I'm trying to think if there are some
  downsides in using …

  After trying to make …, the raw "httpResponseHeaders" value is a list of name/value pairs:

      "httpResponseHeaders": [..., {'name': 'content-type', 'value': 'text/html; charset=UTF-8'}, ...]

  To search for this value, we'd need to iterate through the list of header key-value pairs.
  On the other hand, …:

      >>> headers = HttpResponseHeaders.from_name_value_pairs(zyte_api_response["httpResponseHeaders"])
      >>> headers.get('Content-Type')
      'text/html; charset=UTF-8'

  There are a lot of benefits to using the existing features in … For the browserHtml,
  I think it would be worth representing it with another class, since …
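To make the trade-off described in that comment concrete, here is a short sketch (hypothetical variable names, not part of the PR) of looking a header up directly in the raw Zyte API list-of-pairs format; the web-poet alternative is the HttpResponseHeaders snippet quoted above:

# Raw Zyte API shape: a list of {"name": ..., "value": ...} pairs.
api_headers = [
    {"name": "content-type", "value": "text/html; charset=UTF-8"},
    {"name": "content-length", "value": "1234"},
]

# Without a helper class, every lookup is a case-insensitive linear scan:
content_type = next(
    (h["value"] for h in api_headers if h["name"].lower() == "content-type"),
    None,
)
print(content_type)  # text/html; charset=UTF-8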
+        if self._http_response_body is _NOT_LOADED:
+            base64_body = self._api_response.get("httpResponseBody", None)
+            if base64_body is None:
+                raise ValueError("API response has no httpResponseBody key.")
+            self._http_response_body = b64decode(base64_body)
+        return self._http_response_body

 class AsyncClient:
     def __init__(self, *,
                  api_key=None,
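A hypothetical usage sketch of the ExtractResult class added in this hunk (the response dict below is made up; a real Zyte API response has more keys): the object behaves like the raw response mapping, and http_response_body base64-decodes httpResponseBody on first access and caches the result.

from base64 import b64encode

# Fabricated API response, for illustration only.
api_response = {
    "url": "https://example.com",
    "httpResponseBody": b64encode(b"<html>...</html>").decode(),
}

result = ExtractResult(api_response)
print(result["url"])              # mapping-style access to the raw response
print(len(result), list(result))  # Mapping also gives len(), iteration, dict(), ...
print(result.http_response_body)  # b'<html>...</html>', decoded once and cached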
@@ -148,3 +188,44 @@ async def _request(query):
                 session=session)

         return asyncio.as_completed([_request(query) for query in queries])

+    @staticmethod
+    def _build_extract_query(raw_query):
+        return {
+            _to_lower_camel_case(k): v
+            for k, v in raw_query.items()
+        }
+
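The _to_lower_camel_case helper lives in ..utils and is not shown in this diff. Assuming it converts snake_case keys to the lowerCamelCase names the Zyte API expects, the query-building step behaves roughly like this sketch (the helper below is a stand-in, not the library's implementation):

def to_lower_camel_case(snake: str) -> str:
    # Stand-in for zyte_api's _to_lower_camel_case helper.
    first, *rest = snake.split("_")
    return first + "".join(part.capitalize() for part in rest)


raw_query = {"url": "https://example.com", "browser_html": True}
query = {to_lower_camel_case(k): v for k, v in raw_query.items()}
print(query)  # {'url': 'https://example.com', 'browserHtml': True}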
+    async def extract(
+        self,
+        url: str,
+        *,
+        session: Optional[aiohttp.ClientSession] = None,
+        handle_retries: bool = True,
+        retrying: Optional[AsyncRetrying] = None,
+        **kwargs,
+    ) -> ExtractResult:
+        """…"""
+        query = self._build_extract_query({**kwargs, 'url': url})
+        response = await self.request_raw(
+            query=query,
+            endpoint='extract',
+            session=session,
+            handle_retries=handle_retries,
+            retrying=retrying,
+        )
+        return ExtractResult(response)
+
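A hypothetical end-to-end sketch of calling the new extract() method (the API key and URL are placeholders, the zyte_api.aio.client import path is assumed from this repository's layout, and browser_html is assumed to be translated to the API's browserHtml parameter by _build_extract_query):

import asyncio

from zyte_api.aio.client import AsyncClient  # import path assumed


async def main():
    client = AsyncClient(api_key="YOUR_API_KEY")  # placeholder key
    # Keyword arguments become camelCase Zyte API parameters.
    result = await client.extract("https://example.com", browser_html=True)
    print(result["browserHtml"][:100])  # raw API response access
    # result.http_response_body is only available if httpResponseBody was requested.


asyncio.run(main())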
+    def extract_in_parallel(
+        self,
+        queries: List[dict],
+        *,
+        session: Optional[aiohttp.ClientSession] = None,
+    ) -> Iterator[asyncio.Future]:
+        """…"""
+        queries = [self._build_extract_query(query) for query in queries]
+        return self.request_parallel_as_completed(
+            queries,
+            endpoint='extract',
+            session=session,
+        )
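Likewise, a sketch of consuming extract_in_parallel(), which mirrors request_parallel_as_completed and therefore yields futures in completion order; note that in this diff the futures appear to resolve to raw response dicts rather than ExtractResult objects (queries and the import path below are illustrative):

import asyncio

from zyte_api.aio.client import AsyncClient  # import path assumed


async def main():
    client = AsyncClient(api_key="YOUR_API_KEY")  # placeholder key
    queries = [
        {"url": "https://example.com/a", "browser_html": True},
        {"url": "https://example.com/b", "browser_html": True},
    ]
    for future in client.extract_in_parallel(queries):
        response = await future  # a raw API response dict
        print(response["url"])


asyncio.run(main())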