# akari-bot/modules/wiki/getinfobox.py

import json
import os
import re
import traceback
import uuid
from urllib.parse import urljoin
import aiohttp
from bs4 import BeautifulSoup
from config import Config
from core.template import logger_info
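# URL of an external HTML-to-image rendering service; when unset, the code
# below falls back to writing a local HTML file (inferred from the usage).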
infobox_render = Config('infobox_render')
async def get_infobox_pic(link, pagelink, headers):
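    """Render a wiki page's infobox to an image.

    ``link`` is the wiki's api.php endpoint, ``pagelink`` the full URL of
    the target page, and ``headers`` the HTTP headers to use for requests.
    Returns the path of the rendered image on success, or False on failure.
    """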
try:
        logger_info('Starting to find infobox...')
        # Derive the wiki's base URLs from the api.php endpoint: wlink keeps
        # the script path (e.g. /w/), link strips it down to the site root.
        wlink = re.sub(r'api\.php', '', link)
        link = re.sub(r'(?:w/)?api\.php', '', link)
2021-02-01 15:13:11 +00:00
try:
async with aiohttp.ClientSession(headers=headers) as session:
async with session.get(pagelink, timeout=aiohttp.ClientTimeout(total=20)) as req:
html = await req.read()
        except Exception:
            traceback.print_exc()
            return False
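        # Parse the downloaded page and pick a unique basename for cache files.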
soup = BeautifulSoup(html, 'html.parser')
pagename = uuid.uuid4()
        html_path = os.path.abspath(f'./cache/{pagename}.html')
        if os.path.exists(html_path):
            os.remove(html_path)
logger_info('Downloaded raw.')
        # Try each known infobox class in turn until one matches.
        infobox_classes = [
            'notaninfobox',
            'portable-infobox',
            'infobox',
            'tpl-infobox',
            'infoboxtable',
            'infotemplatebox',
            'skin-infobox',
            'wikitable songtable',  # arcw
        ]
        find_infobox = None
        for infobox_class in infobox_classes:
            find_infobox = soup.find(class_=infobox_class)
            if find_infobox is not None:
                break
        if find_infobox is None:
            return False  # no recognizable infobox on this page; give up
        logger_info('Found infobox, start modifying...')
        # Collect the page's stylesheets so the extracted infobox keeps its
        # styling; write them to a local HTML file, or buffer them for the
        # remote renderer when one is configured.
        if infobox_render is None:
            open_file = open(html_path, 'a', encoding='utf-8')
        else:
            html_list = []
        for x in soup.find_all(rel='stylesheet'):
            y = str(x.get('href'))
            z = urljoin(wlink, y)
            z = re.sub(';', '&', z)
            if infobox_render is None:
                open_file.write(f'<link href="{z}" rel="stylesheet"/>\n')
            else:
                html_list.append(f'<link href="{z}" rel="stylesheet"/>\n')
        def join_url(base, target):
            # Resolve each space-separated token that looks like a path
            # against the base URL; leave plain tokens (e.g. srcset density
            # descriptors such as "2x") untouched.
            targetlist = []
            for x in target.split(' '):
                if '/' in x:
                    x = urljoin(base, x)
                targetlist.append(x)
            return ' '.join(targetlist)
        # Rewrite relative URLs inside the infobox so links, images and
        # inline styles still resolve outside the wiki.
        for x in find_infobox.find_all(['a', 'img', 'span']):
            if x.has_attr('href'):
                x.attrs['href'] = join_url(link, x.get('href'))
            if x.has_attr('src'):
                x.attrs['src'] = join_url(link, x.get('src'))
            if x.has_attr('srcset'):
                x.attrs['srcset'] = join_url(link, x.get('srcset'))
            if x.has_attr('style'):
                x.attrs['style'] = re.sub(r'url\(/(.*)\)', 'url(' + link + r'\1)', x.get('style'))
        if infobox_render is None:
            open_file.write(str(find_infobox))
            open_file.close()
        else:
            html_list.append(str(find_infobox))
            html = '\n'.join(html_list)
            html = {'content': html}
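        if infobox_render is None:
            # No remote renderer is configured. The infobox HTML has been
            # written to html_path; returning it here is an assumption, since
            # the POST below would otherwise go to a None URL and fail.
            return html_path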
        logger_info('Start rendering...')
picname = os.path.abspath(f'./cache/{pagename}.jpg')
if os.path.exists(picname):
os.remove(picname)
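        # POST the assembled HTML to the rendering service; the response body
        # is expected to be the image bytes (inferred from the write below).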
async with aiohttp.ClientSession() as session:
async with session.post(infobox_render, headers={
'Content-Type': 'application/json',
}, data=json.dumps(html)) as resp:
with open(picname, 'wb+') as jpg:
jpg.write(await resp.read())
return picname
except Exception:
traceback.print_exc()
return False
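
# Usage sketch (hypothetical endpoint and page; the coroutine must be
# awaited from async code):
#
#     pic = await get_infobox_pic(
#         'https://example.org/w/api.php',
#         'https://example.org/wiki/SomePage',
#         headers={'User-Agent': 'akari-bot'})
#     if pic:
#         ...  # pic is the path of the rendered image file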