Archived
1
0
Fork 0
This repository has been archived on 2024-04-26. You can view files and clone it, but cannot push or open issues or pull requests.
akari-bot/modules/dictionary/screenshot.py
2023-04-30 11:30:59 +08:00

124 lines
4.4 KiB
Python

import os
import re
import traceback
import uuid
from typing import Union
from urllib.parse import urljoin
import aiohttp
import ujson as json
from bs4 import BeautifulSoup
from config import Config
from core.logger import Logger
# Value of the 'web_render_local' config key. get_pic() uses it as the URL
# prefix of the web-render service (it appends 'source?url=...' for fetching
# and POSTs to it directly for rendering) and gives up when it is falsy.
web_render = Config('web_render_local')
# Dictionary sources that get_pic() knows how to crop.
_SUPPORTED_SOURCES = ('collins', 'yd')


def _join_url(base: str, target: str) -> str:
    """Resolve every space-separated token of *target* that contains '/' against *base*.

    Tokens without a slash (e.g. the ``2x`` descriptors of a ``srcset``) are
    kept unchanged.
    """
    return ' '.join(
        urljoin(base, token) if '/' in token else token
        for token in target.split(' ')
    )


def _build_page_html(soup, link: str, source: str) -> Union[str, None]:
    """Build the standalone HTML page that is sent to the renderer.

    Relative URLs inside *soup* are rewritten against *link* (the soup is
    mutated in place), then the stylesheets, ``<style>`` tags and the
    dictionary entry selected for *source* are serialised into one document.

    :param soup: parsed page (BeautifulSoup object).
    :param link: URL the page was fetched from; base for relative URLs.
    :param source: ``'collins'`` or ``'yd'``.
    :returns: the HTML text, or ``None`` when *source* is unsupported or the
        entry container cannot be found in the page.
    """
    parts = ['<!DOCTYPE html>\n']

    # Re-emit the <html> opening tag with its original attributes.
    for html_tag in soup.find_all('html'):
        rendered_attrs = []
        for key, value in html_tag.attrs.items():
            if isinstance(value, str):
                rendered_attrs.append(f'{key}="{value}"')
            elif isinstance(value, list):
                joined = ' '.join(value)
                rendered_attrs.append(f'{key}="{joined}"')
        parts.append(f'<html {" ".join(rendered_attrs)}>')

    parts.append('<head>\n')
    for sheet in soup.find_all(rel='stylesheet'):
        if sheet.has_attr('href'):
            sheet.attrs['href'] = urljoin(link, sheet.get('href')).replace(';', '&')
        parts.append(str(sheet))
    # Make every remaining href in the document absolute as well.
    for tag in soup.find_all():
        if tag.has_attr('href'):
            tag.attrs['href'] = urljoin(link, tag.get('href')).replace(';', '&')
    parts.append('</head>')

    parts.extend(str(style_tag) for style_tag in soup.find_all('style'))

    for body in soup.find_all('body'):
        if body.has_attr('class'):
            body_classes = ' '.join(body.get('class'))
            parts.append(f'<body class="{body_classes}">')

    for tag in soup.find_all(['a', 'img', 'span']):
        for attr in ('href', 'src', 'srcset'):
            if tag.has_attr(attr):
                tag.attrs[attr] = _join_url(link, tag.get(attr))
        if tag.has_attr('style'):
            # NOTE(review): the pattern swallows the leading '/' and the
            # replacement glues the remainder straight onto `link`; kept
            # unchanged here - confirm this is the intended URL shape.
            tag.attrs['style'] = re.sub(
                r'url\(/(.*)\)', 'url(' + link + '\\1)', tag.get('style'))

    if source == 'collins':
        parts.append('<div id="main_content" class="he dc page">')
        content = soup.select_one(
            '.dictionaries > .dictionary, .dictionaries.dictionary')
        if content is None:
            return None
        # Drop widgets that should not appear in the screenshot.
        for junk in content.select(
                '.hwd_sound, .cobuild-logo, .pronIPASymbol, .title_frequency_container'):
            junk.decompose()
    elif source == 'yd':
        parts.append('<div class="simple basic">')
        content = soup.select_one('.basic')
        if content is None:
            # Without this guard the literal text "None" would be rendered.
            return None
    else:
        return None

    parts.append(str(content))
    parts.append('</div></body>')
    parts.append('</html>')
    return ''.join(parts)


async def get_pic(link, source) -> Union[str, bool]:
    """Render the dictionary entry at *link* to a JPEG via the web-render service.

    The page is fetched through the service's ``source?url=`` endpoint, reduced
    to the entry block for *source*, saved as ``./cache/<uuid>.html`` and then
    POSTed back to the service; the returned image is stored as
    ``./cache/<uuid>.jpg``.

    :param link: URL of the dictionary page.
    :param source: which dictionary layout to extract: ``'collins'`` or ``'yd'``.
    :returns: absolute path of the written ``.jpg``, or ``False`` when the
        service is not configured, *source* is unsupported, the entry block is
        missing, the renderer answers with a non-200 status, or any other
        error occurs (the traceback is printed).
    """
    if not web_render:
        return False
    if source not in _SUPPORTED_SOURCES:
        # Reject early instead of downloading and caching a page we cannot crop.
        return False
    try:
        Logger.info('Starting find section..')
        async with aiohttp.ClientSession() as session:
            async with session.get(web_render + 'source?url=' + link,
                                   timeout=aiohttp.ClientTimeout(total=20)) as req:
                raw_page = await req.read()
        soup = BeautifulSoup(raw_page, 'html.parser')
        Logger.info('Downloaded raw.')

        page = _build_page_html(soup, link, source)
        if page is None:
            return False

        pagename = uuid.uuid4()
        url = os.path.abspath(f'./cache/{pagename}.html')
        with open(url, 'w', encoding='utf-8') as page_file:
            page_file.write(page)
        # Read the page back so the payload is exactly what was persisted.
        with open(url, 'r', encoding='utf-8') as page_file:
            payload = {'content': page_file.read(), 'width': 1000}

        Logger.info('Start rendering...')
        picname = os.path.abspath(f'./cache/{pagename}.jpg')
        async with aiohttp.ClientSession() as session:
            async with session.post(web_render, headers={
                'Content-Type': 'application/json',
            }, data=json.dumps(payload)) as resp:
                if resp.status != 200:
                    # Do not store an error body as if it were an image.
                    Logger.info(f'Web render failed with status {resp.status}.')
                    return False
                image = await resp.read()
        with open(picname, 'wb') as jpg:
            jpg.write(image)
        return picname
    except Exception:
        # Best-effort boundary: report and signal failure. Exception (not
        # BaseException) so task cancellation still propagates.
        traceback.print_exc()
        return False