This commit is contained in:
2022-10-20 10:28:21 +02:00
commit 205b16263c
10 changed files with 544 additions and 0 deletions

117
parsers/base.py Normal file
View File

@@ -0,0 +1,117 @@
import abc
import asyncio
from dataclasses import dataclass
from datetime import date
import hashlib
from typing import Optional
from bs4 import BeautifulSoup
from sqlitedict import SqliteDict
import httpx
@dataclass
class AnnoncementPreview:
title: str
update_date: Optional[date]
link: str
@dataclass
class Announcement:
title: str
description: str
price: int
update_date: date
link: str
class BaseParser(abc.ABC):
BASE_SEARCH_LINK = ""
BASE_PARAMS = {}
PAGE_PARAM = ""
@classmethod
def process_previews_page(cls, bs: BeautifulSoup) -> list[AnnoncementPreview]:
...
@classmethod
@abc.abstractmethod
def process_annoncement_data(cls, bs: BeautifulSoup, preview: AnnoncementPreview) -> Announcement:
...
@classmethod
async def get_annoncement_by_preview(cls, preview: AnnoncementPreview) -> Optional[Announcement]:
print(f"Get annoncement by link: {preview.link} ...")
async with httpx.AsyncClient() as client:
try:
response = await client.get(preview.link)
except httpx.ConnectError:
return None
except httpx.ReadTimeout:
return None
except httpx.ConnectTimeout:
return None
bs = BeautifulSoup(response.text, features="html.parser")
return cls.process_annoncement_data(bs, preview)
@classmethod
async def parse(cls, db: SqliteDict, queue: asyncio.Queue) -> None:
page = 1
while page <= 20:
params = {
**cls.BASE_PARAMS,
cls.PAGE_PARAM: page
}
print(f"Get {cls.__name__} page {page} previews...")
async with httpx.AsyncClient() as client:
try:
response = await client.get(cls.BASE_SEARCH_LINK, params=params)
except httpx.ReadTimeout:
return
bs = BeautifulSoup(response.text, features="html.parser")
previews = cls.process_previews_page(bs)
last_annoncement_date = None
for preview in previews:
if preview.update_date:
last_annoncement_date = preview.update_date
if preview.update_date is not None and preview.update_date != date.today():
continue
link_hash = hashlib.md5(preview.link.encode()).hexdigest()
if db.get(link_hash, None) == date.today().isoformat():
last_annoncement_date = date.today()
continue
annoncement = await cls.get_annoncement_by_preview(preview)
if annoncement:
await queue.put(annoncement)
last_annoncement_date = annoncement.update_date
page += 1
if last_annoncement_date is None:
continue
if (date.today() - last_annoncement_date).days >= 2:
break
@classmethod
async def start_parse(cls, db: SqliteDict, queue: asyncio.Queue) -> None:
while True:
await cls.parse(db, queue)
await asyncio.sleep(180)