# akari-bot/modules/dictionary/screenshot.py
import os
import re
import traceback
import uuid
from typing import Union
from urllib.parse import urljoin

import aiohttp
import ujson as json
from bs4 import BeautifulSoup

from config import Config
from core.logger import Logger

web_render = Config('web_render_local')


async def get_pic(link, source) -> Union[str, bool]:
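    """Screenshot a dictionary entry via the web render service.

    Fetch the page source for ``link``, cut out the section that matters for
    the given ``source`` ('collins' or 'yd'), rebuild it as a standalone HTML
    file under ./cache/, and post that to the renderer for a JPEG screenshot.

    Returns the absolute path of the rendered image, or False on failure.
    """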
    if not web_render:
        return False
    try:
        Logger.info('Starting to find section...')
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(web_render + 'source?url=' + link,
                                       timeout=aiohttp.ClientTimeout(total=20)) as req:
                    html = await req.read()
        except Exception:
            traceback.print_exc()
            return False
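        # Parse the fetched source and stage a scratch HTML file in ./cache/,
        # named with a fresh UUID so concurrent lookups cannot collide.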
        soup = BeautifulSoup(html, 'html.parser')
        pagename = uuid.uuid4()
        url = os.path.abspath(f'./cache/{pagename}.html')
        if os.path.exists(url):
            os.remove(url)
        Logger.info('Downloaded raw.')
        open_file = open(url, 'a', encoding='utf-8')
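        # Resolve each path-like token of a space-separated URL list (as found
        # in href/src/srcset attributes) against the page's base URL; tokens
        # without a '/' are dropped.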
        def join_url(base, target):
            target = target.split(' ')
            targetlist = []
            for x in target:
                if x.find('/') != -1:
                    x = urljoin(base, x)
                    targetlist.append(x)
            target = ' '.join(targetlist)
            return target
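        # Rebuild a minimal standalone page: keep the <html> attributes,
        # absolutize the stylesheet links, and rewrite every href to point
        # back at the original site.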
        open_file.write('<!DOCTYPE html>\n')
        for x in soup.find_all('html'):
            fl = []
            for f in x.attrs:
                if isinstance(x.attrs[f], str):
                    fl.append(f'{f}="{x.attrs[f]}"')
                elif isinstance(x.attrs[f], list):
                    fl.append(f'{f}="{" ".join(x.attrs[f])}"')
            open_file.write(f'<html {" ".join(fl)}>')
        open_file.write('<head>\n')
        for x in soup.find_all(rel='stylesheet'):
            if x.has_attr('href'):
                x.attrs['href'] = re.sub(
                    ';', '&', urljoin(link, x.get('href')))
            open_file.write(str(x))
        for x in soup.find_all():
            if x.has_attr('href'):
                x.attrs['href'] = re.sub(
                    ';', '&', urljoin(link, x.get('href')))
        open_file.write('</head>')
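        # Re-emit the page's inline <style> blocks and reopen a <body> tag
        # carrying the original body classes so the site's CSS still applies.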
        for x in soup.find_all('style'):
            open_file.write(str(x))
        for x in soup.find_all('body'):
            if x.has_attr('class'):
                open_file.write(
                    f'<body class="{" ".join(x.get("class"))}">')
        for x in soup.find_all(['a', 'img', 'span']):
            if x.has_attr('href'):
                x.attrs['href'] = join_url(link, x.get('href'))
            if x.has_attr('src'):
                x.attrs['src'] = join_url(link, x.get('src'))
            if x.has_attr('srcset'):
                x.attrs['srcset'] = join_url(link, x.get('srcset'))
            if x.has_attr('style'):
                x.attrs['style'] = re.sub(
                    r'url\(/(.*)\)', 'url(' + link + '\\1)', x.get('style'))
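        # Cut the page down to the dictionary entry itself; for Collins, also
        # strip audio buttons, logos, and other chrome that would clutter the
        # screenshot.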
        if source == 'collins':
            open_file.write('<div id="main_content" class="he dc page">')
            content = soup.select_one(
                '.dictionaries > .dictionary, .dictionaries.dictionary')
            trash = content.select(
                '.hwd_sound, .cobuild-logo, .pronIPASymbol, .title_frequency_container')
            for x in trash:
                x.decompose()
        elif source == 'yd':
            open_file.write('<div class="simple basic">')
            content = soup.select_one('.basic')
        else:
            open_file.close()
            return False
        open_file.write(str(content))
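        # Hand the rebuilt page to the web render service as JSON; it replies
        # with the screenshot bytes, rendered at a fixed 1000 px width.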
        w = 1000
        open_file.write('</div></body>')
        open_file.write('</html>')
        open_file.close()
        read_file = open(url, 'r', encoding='utf-8')
        html = {'content': read_file.read(), 'width': w}
        read_file.close()
        Logger.info('Start rendering...')
        picname = os.path.abspath(f'./cache/{pagename}.jpg')
        if os.path.exists(picname):
            os.remove(picname)
        async with aiohttp.ClientSession() as session:
            async with session.post(web_render, headers={
                'Content-Type': 'application/json',
            }, data=json.dumps(html)) as resp:
                with open(picname, 'wb+') as jpg:
                    jpg.write(await resp.read())
        return picname
    except Exception:
        traceback.print_exc()
        return False
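# A rough manual-test sketch (illustrative assumption, not part of the bot's
# runtime; it presumes web_render is configured and the Collins markup still
# matches the selectors above):
#
#     import asyncio
#
#     pic = asyncio.run(get_pic(
#         'https://www.collinsdictionary.com/dictionary/english/example',
#         'collins'))
#     print(pic if pic else 'render failed')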