import os
import re
import traceback
import uuid
from typing import Union
from urllib.parse import urljoin

import aiohttp
import ujson as json
from bs4 import BeautifulSoup

from config import Config
from core.logger import Logger

web_render = Config('web_render_local')


async def get_pic(link, source) -> Union[str, bool]:
    """Render a dictionary entry page to an image via the web_render service.

    Returns the screenshot path on success, False otherwise.
    """
    if not web_render:
        return False
    try:
        Logger.info('Starting to fetch the page source...')
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(web_render + 'source?url=' + link,
                                       timeout=aiohttp.ClientTimeout(total=20)) as req:
                    html = await req.read()
        except Exception:
            traceback.print_exc()
            return False
        soup = BeautifulSoup(html, 'html.parser')
        pagename = uuid.uuid4()
        url = os.path.abspath(f'./cache/{pagename}.html')
        if os.path.exists(url):
            os.remove(url)
        Logger.info('Downloaded raw page.')
        open_file = open(url, 'a', encoding='utf-8')

        def join_url(base, target):
            # Resolve every path-like token of a (possibly space-separated)
            # attribute value such as srcset against the base URL.
            target = target.split(' ')
            targetlist = []
            for x in target:
                if x.find('/') != -1:
                    x = urljoin(base, x)
                targetlist.append(x)
            return ' '.join(targetlist)

        # Rebuild a minimal standalone document: keep the original <html>
        # attributes, then stylesheets, inline styles and the <body> classes.
        open_file.write('<!DOCTYPE html>\n')
        for x in soup.find_all('html'):
            fl = []
            for f in x.attrs:
                if isinstance(x.attrs[f], str):
                    fl.append(f'{f}="{x.attrs[f]}"')
                elif isinstance(x.attrs[f], list):
                    fl.append(f'{f}="{" ".join(x.attrs[f])}"')
            open_file.write(f'<html {" ".join(fl)}>')
        open_file.write('<head>\n')
        for x in soup.find_all(rel='stylesheet'):
            if x.has_attr('href'):
                x.attrs['href'] = re.sub(
                    ';', '&', urljoin(link, x.get('href')))
            open_file.write(str(x))
        for x in soup.find_all():
            if x.has_attr('href'):
                x.attrs['href'] = re.sub(
                    ';', '&', urljoin(link, x.get('href')))
        open_file.write('</head>')
        for x in soup.find_all('style'):
            open_file.write(str(x))
        for x in soup.find_all('body'):
            if x.has_attr('class'):
                open_file.write(f'<body class="{" ".join(x.get("class"))}">')
        for x in soup.find_all(['a', 'img', 'span']):
            if x.has_attr('href'):
                x.attrs['href'] = join_url(link, x.get('href'))
            if x.has_attr('src'):
                x.attrs['src'] = join_url(link, x.get('src'))
            if x.has_attr('srcset'):
                x.attrs['srcset'] = join_url(link, x.get('srcset'))
            if x.has_attr('style'):
                x.attrs['style'] = re.sub(
                    r'url\(/(.*)\)', 'url(' + link + '\\1)', x.get('style'))
        if source == 'collins':
            # The original wrapper markup was lost in extraction; a bare
            # <div> is assumed here.
            open_file.write('<div>')
            content = soup.select_one(
                '.dictionaries > .dictionary, .dictionaries.dictionary')
            # .select() always returns a list, so iterate over it directly
            # instead of testing against None.
            trash = content.select(
                '.hwd_sound, .cobuild-logo, .pronIPASymbol, .title_frequency_container')
            for x in trash:
                x.decompose()
        elif source == 'yd':
            # As above, the original wrapper markup was lost; a bare <div>
            # is assumed.
            open_file.write('<div>')
            content = soup.select_one('.basic')
        else:
            open_file.close()
            return False
        open_file.write(str(content))
        w = 1000
        open_file.write('</div>')
        open_file.write('</body></html>')
        open_file.close()
        read_file = open(url, 'r', encoding='utf-8')
        html = {'content': read_file.read(), 'width': w}
        read_file.close()
        Logger.info('Start rendering...')
        picname = os.path.abspath(f'./cache/{pagename}.jpg')
        if os.path.exists(picname):
            os.remove(picname)
        async with aiohttp.ClientSession() as session:
            async with session.post(web_render, headers={
                'Content-Type': 'application/json',
            }, data=json.dumps(html)) as resp:
                with open(picname, 'wb+') as jpg:
                    jpg.write(await resp.read())
        return picname
    except Exception:
        traceback.print_exc()
        return False
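

# A minimal usage sketch (an assumption, not part of the original module):
# get_pic is a coroutine, so it must be awaited from an event loop. The
# dictionary URL and the 'collins' source tag below are illustrative
# placeholders only.
if __name__ == '__main__':
    import asyncio

    async def _demo():
        pic = await get_pic(
            'https://www.collinsdictionary.com/dictionary/english/example',
            'collins')
        if pic:
            print(f'Rendered screenshot saved to {pic}')
        else:
            print('Rendering failed (is web_render_local configured?)')

    asyncio.run(_demo())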