# akari-bot/modules/wiki/utils/screenshot_image.py
import os
import re
import traceback
import uuid
from typing import Union
from urllib.parse import urljoin

import aiohttp
import ujson as json
from bs4 import BeautifulSoup, Comment

from config import Config
from core.logger import Logger
from core.utils import download_to_cache

web_render = Config('web_render_local')

elements = ['.notaninfobox', '.portable-infobox', '.infobox', '.tpl-infobox', '.infoboxtable', '.infotemplatebox',
            '.skin-infobox', '.arcaeabox', '.moe-infobox', '.rotable']
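

# generate_screenshot_v2 delegates rendering to the external web_render service:
# it POSTs the page URL (plus either the list of infobox selectors or a section
# name) to the element_screenshot / section_screenshot endpoints and returns the
# file cached by download_to_cache, or False if web_render is unavailable or fails.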
async def generate_screenshot_v2(page_link, section=None, allow_special_page=False, content_mode=False) -> Union[
        str, bool]:
    elements_ = elements.copy()
    if not web_render:
        return False
    if section is None:
        if allow_special_page and content_mode:
            elements_.insert(0, '.mw-body-content')
        if allow_special_page and not content_mode:
            elements_.insert(0, '.diff')
        Logger.info('[Webrender] Generating element screenshot...')
        try:
            return await download_to_cache(web_render + 'element_screenshot', status_code=200,
                                           headers={'Content-Type': 'application/json'},
                                           method="POST",
                                           post_data=json.dumps({
                                               'url': page_link,
                                               'element': elements_}),
                                           attempt=1, timeout=30,
                                           request_private_ip=True
                                           )
        except ValueError:
            traceback.print_exc()
            Logger.info('[Webrender] Generation Failed.')
            return False
    else:
        Logger.info('[Webrender] Generating section screenshot...')
        try:
            return await download_to_cache(web_render + 'section_screenshot', status_code=200,
                                           headers={'Content-Type': 'application/json'},
                                           method="POST",
                                           post_data=json.dumps({
                                               'url': page_link,
                                               'section': section}),
                                           attempt=1,
                                           timeout=30,
                                           request_private_ip=True
                                           )
        except ValueError:
            traceback.print_exc()
            Logger.info('[Webrender] Generation Failed.')
            return False
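

# generate_screenshot_v1 is the fallback path: it downloads the page HTML itself,
# keeps only the interesting part (a diff table, the first matching infobox, or a
# named section), rewrites relative links so they resolve against the wiki, writes
# the result to ./cache/<uuid>.html and asks web_render to render it to a JPEG.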
async def generate_screenshot_v1(link, page_link, headers, section=None, allow_special_page=False) -> Union[str, bool]:
    if not web_render:
        return False
    try:
        Logger.info('Starting to find infobox/section...')
        if link[-1] != '/':
            link += '/'
        try:
            async with aiohttp.ClientSession(headers=headers) as session:
                async with session.get(page_link, timeout=aiohttp.ClientTimeout(total=20)) as req:
                    html = await req.read()
        except Exception:
            traceback.print_exc()
            return False
        soup = BeautifulSoup(html, 'html.parser')
        pagename = uuid.uuid4()
        url = os.path.abspath(f'./cache/{pagename}.html')
        if os.path.exists(url):
            os.remove(url)
        Logger.info('Downloaded raw.')
        open_file = open(url, 'a', encoding='utf-8')
        timeless_fix = False
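
        # join_url() fixes up attribute values that may hold several space-separated
        # URLs (e.g. srcset): every token containing a path is resolved against the
        # wiki base URL so the rebuilt HTML can still load images and styles.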
        def join_url(base, target):
            target = target.split(' ')
            targetlist = []
            for x in target:
                if x.find('/') != -1:
                    x = urljoin(base, x)
                    targetlist.append(x)
                else:
                    targetlist.append(x)
            target = ' '.join(targetlist)
            return target

        open_file.write('<!DOCTYPE html>\n')
        for x in soup.find_all('html'):
            fl = []
            for f in x.attrs:
                if isinstance(x.attrs[f], str):
                    fl.append(f'{f}="{x.attrs[f]}"')
                elif isinstance(x.attrs[f], list):
                    fl.append(f'{f}="{" ".join(x.attrs[f])}"')
            open_file.write(f'<html {" ".join(fl)}>')
        open_file.write('<head>\n')
        for x in soup.find_all(rel='stylesheet'):
            if x.has_attr('href'):
                get_href = x.get('href')
                if get_href.find('timeless') != -1:
                    timeless_fix = True
                x.attrs['href'] = re.sub(';', '&', urljoin(link, get_href))
                open_file.write(str(x))
        for x in soup.find_all():
            if x.has_attr('href'):
                x.attrs['href'] = re.sub(';', '&', urljoin(link, x.get('href')))
        open_file.write('</head>')
        for x in soup.find_all('style'):
            open_file.write(str(x))
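
        # No section requested: capture a diff table (for special pages) or the first
        # matching infobox; otherwise fall through to the section branch below.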
        if section is None:
            find_diff = None
            if allow_special_page:
                find_diff = soup.find('table', class_=re.compile('diff'))
                if find_diff is not None:
                    Logger.info('Found diff...')
                    for x in soup.find_all('body'):
                        if x.has_attr('class'):
                            open_file.write(f'<body class="{" ".join(x.get("class"))}">')
                    for x in soup.find_all('div'):
                        if x.has_attr('id'):
                            if x.get('id') in ['content', 'mw-content-text']:
                                fl = []
                                for f in x.attrs:
                                    if isinstance(x.attrs[f], str):
                                        fl.append(f'{f}="{x.attrs[f]}"')
                                    elif isinstance(x.attrs[f], list):
                                        fl.append(f'{f}="{" ".join(x.attrs[f])}"')
                                open_file.write(f'<div {" ".join(fl)}>')
                    open_file.write('<div class="mw-parser-output">')
                    for x in soup.find_all('main'):
                        fl = []
                        for f in x.attrs:
                            if isinstance(x.attrs[f], str):
                                fl.append(f'{f}="{x.attrs[f]}"')
                            elif isinstance(x.attrs[f], list):
                                fl.append(f'{f}="{" ".join(x.attrs[f])}"')
                        open_file.write(f'<main {" ".join(fl)}>')
                    open_file.write(str(find_diff))
                    w = 2000
            if find_diff is None:
                infoboxes = elements.copy()
                find_infobox = None
                for i in infoboxes:
                    find_infobox = soup.find(class_=i[1:])
                    if find_infobox is not None:
                        break
                if find_infobox is None:
                    Logger.info('Found nothing...')
                    return False
                else:
                    Logger.info('Found infobox...')
                    for x in find_infobox.find_all(['a', 'img', 'span']):
                        if x.has_attr('href'):
                            x.attrs['href'] = join_url(link, x.get('href'))
                        if x.has_attr('src'):
                            x.attrs['src'] = join_url(link, x.get('src'))
                        if x.has_attr('srcset'):
                            x.attrs['srcset'] = join_url(link, x.get('srcset'))
                        if x.has_attr('style'):
                            x.attrs['style'] = re.sub(r'url\(/(.*)\)', 'url(' + link + '\\1)', x.get('style'))
                    for x in find_infobox.find_all(class_='lazyload'):
                        if x.has_attr('class') and x.has_attr('data-src'):
                            x.attrs['class'] = 'image'
                            x.attrs['src'] = x.attrs['data-src']
                    open_file.write('<div class="mw-parser-output">')
                    open_file.write(str(find_infobox))
                    w = 500
            open_file.write('</div>')
        else:
            # A specific section was requested: rebuild the page skeleton, then copy
            # the heading that anchors the section plus its following siblings.
            for x in soup.find_all('body'):
                if x.has_attr('class'):
                    open_file.write(f'<body class="{" ".join(x.get("class"))}">')
            for x in soup.find_all('div'):
                if x.has_attr('id'):
                    if x.get('id') in ['content', 'mw-content-text']:
                        fl = []
                        for f in x.attrs:
                            if isinstance(x.attrs[f], str):
                                fl.append(f'{f}="{x.attrs[f]}"')
                            elif isinstance(x.attrs[f], list):
                                fl.append(f'{f}="{" ".join(x.attrs[f])}"')
                        open_file.write(f'<div {" ".join(fl)}>')
            open_file.write('<div class="mw-parser-output">')
            for x in soup.find_all('main'):
                fl = []
                for f in x.attrs:
                    if isinstance(x.attrs[f], str):
                        fl.append(f'{f}="{x.attrs[f]}"')
                    elif isinstance(x.attrs[f], list):
                        fl.append(f'{f}="{" ".join(x.attrs[f])}"')
                open_file.write(f'<main {" ".join(fl)}>')

            def is_comment(e):
                return isinstance(e, Comment)

            to_remove = soup.find_all(text=is_comment)
            for element in to_remove:
                element.extract()
            selected = False
            x = None
            hx = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            selected_hx = None
            for h in hx:
                if selected:
                    break
                for x in soup.find_all(h):
                    for y in x.find_all('span', id=section):
                        if y != '':
                            selected = True
                            selected_hx = h
                            break
                    if selected:
                        break
            if not selected:
                Logger.info('Found nothing...')
                return False
            Logger.info('Found section...')
            open_file.write(str(x))
            b = x
            bl = []
            while True:
                b = b.next_sibling
                if b is None:
                    break
                if b.name == selected_hx:
                    break
                if b.name in hx:
                    if hx.index(selected_hx) >= hx.index(b.name):
                        break
                if b not in bl:
                    bl.append(str(b))
            open_file.write(''.join(bl))
            open_file.close()
            open_file = open(url, 'r', encoding='utf-8')
            soup = BeautifulSoup(open_file.read(), 'html.parser')
            open_file.close()
            for x in soup.find_all(['a', 'img', 'span']):
                if x.has_attr('href'):
                    x.attrs['href'] = join_url(link, x.get('href'))
                if x.has_attr('src'):
                    x.attrs['src'] = join_url(link, x.get('src'))
                if x.has_attr('srcset'):
                    x.attrs['srcset'] = join_url(link, x.get('srcset'))
                if x.has_attr('style'):
                    x.attrs['style'] = re.sub(r'url\(/(.*)\)', 'url(' + link + '\\1)', x.get('style'))
            for x in soup.find_all(class_='lazyload'):
                if x.has_attr('class') and x.has_attr('data-src'):
                    x.attrs['class'] = 'image'
                    x.attrs['src'] = x.attrs['data-src']
            open_file = open(url, 'w', encoding='utf-8')
            open_file.write(str(soup))
            w = 1000
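
        # Close the wrappers opened above and append styling shared by every branch
        # (the "heimu" spoiler styles, plus a white-background fix for the Timeless skin).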
        open_file.write('</div></body>')
        open_file.write('<style>span.heimu a.external,\
span.heimu a.external:visited,\
span.heimu a.extiw,\
span.heimu a.extiw:visited {\
color: #252525;\
}\
.heimu,\
.heimu a,\
a .heimu,\
.heimu a.new {\
background-color: #cccccc;\
text-shadow: none;\
}</style>')
        if timeless_fix:
            open_file.write('<style>body {\
background: white!important}</style>')
        open_file.write('</html>')
        open_file.close()
        read_file = open(url, 'r', encoding='utf-8')
        html = {'content': read_file.read(), 'width': w, 'mw': True}
        read_file.close()
        Logger.info('Start rendering...')
        picname = os.path.abspath(f'./cache/{pagename}.jpg')
        if os.path.exists(picname):
            os.remove(picname)
        async with aiohttp.ClientSession() as session:
            async with session.post(web_render, headers={
                'Content-Type': 'application/json',
            }, data=json.dumps(html)) as resp:
                with open(picname, 'wb+') as jpg:
                    jpg.write(await resp.read())
        return picname
    except Exception:
        traceback.print_exc()
        return False
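
# Example call site (sketch only): the surrounding wiki module is assumed to try the
# web_render service first and fall back to local HTML reconstruction; the variable
# names below are illustrative and not defined in this file.
#
#     img = await generate_screenshot_v2(page_link, section=section, allow_special_page=True)
#     if not img:
#         img = await generate_screenshot_v1(wiki_base_url, page_link, headers, section=section)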