akari-bot/modules/wiki/getinfobox.py

import json
import os
import re
import traceback
import uuid
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

from config import Config
from core.template import logger_info

infobox_render = Config('infobox_render')


async def get_infobox_pic(link, pagelink, headers):
    try:
        logger_info('Starting find infobox..')
        wlink = re.sub(r'api.php', '', link)
        link = re.sub(r'(?:w/|)api.php', '', link)
        try:
            async with aiohttp.ClientSession(headers=headers) as session:
                async with session.get(pagelink, timeout=aiohttp.ClientTimeout(total=20)) as req:
                    html = await req.read()
        except:
            traceback.print_exc()
            return False
        soup = BeautifulSoup(html, 'html.parser')
        pagename = uuid.uuid4()
        url = os.path.abspath(f'./cache/{pagename}.html')
        if os.path.exists(url):
            os.remove(url)
        logger_info('Downloaded raw.')
        find_infobox = soup.find(class_='notaninfobox')  # 我
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='portable-infobox')  # 找
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='infobox')  # 找
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='tpl-infobox')  # 找
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='infoboxtable')  # 找
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='infotemplatebox')  # 找
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='skin-infobox')  # 找
        if find_infobox is None:  # 找
            find_infobox = soup.find(class_='wikitable songtable')  # 找 (arcw)
        if find_infobox is None:  # 找
            return False  # 找你妈，不找了<-咱还是回家吧
        logger_info('Find infobox, start modding...')

        if infobox_render is None:
            open_file = open(url, 'a', encoding='utf-8')
        else:
            html_list = []

        for x in soup.find_all(rel='stylesheet'):
            y = str(x.get('href'))
            z = urljoin(wlink, y)
            z = re.sub(';', '&', z)
            if infobox_render is None:
                open_file.write(f'<link href="{z}" rel="stylesheet"/>\n')
            else:
                html_list.append(f'<link href="{z}" rel="stylesheet"/>\n')

        def join_url(base, target):
            target = target.split(' ')
            targetlist = []
            for x in target:
                if x.find('/') != -1:
                    x = urljoin(base, x)
                    targetlist.append(x)
                else:
                    targetlist.append(x)
            target = ' '.join(targetlist)
            return target

        for x in find_infobox.find_all(['a', 'img', 'span']):
            if x.has_attr('href'):
                x.attrs['href'] = join_url(link, x.get('href'))
            if x.has_attr('src'):
                x.attrs['src'] = join_url(link, x.get('src'))
            if x.has_attr('srcset'):
                x.attrs['srcset'] = join_url(link, x.get('srcset'))
            if x.has_attr('style'):
                x.attrs['style'] = re.sub(r'url\(/(.*)\)', 'url(' + link + '\\1)', x.get('style'))
        replace_link = find_infobox

        if infobox_render is None:
            open_file.write(str(replace_link))
            open_file.close()
        else:
            html_list.append(str(replace_link))
            html = '\n'.join(html_list)
            html = {'content': html}
        logger_info('Start rendering...')
        picname = os.path.abspath(f'./cache/{pagename}.jpg')
        if os.path.exists(picname):
            os.remove(picname)
        async with aiohttp.ClientSession() as session:
            async with session.post(infobox_render, headers={
                'Content-Type': 'application/json',
            }, data=json.dumps(html)) as resp:
                with open(picname, 'wb+') as jpg:
                    jpg.write(await resp.read())
        return picname
    except Exception:
        traceback.print_exc()
        return False
v3 2021-02-01 15:13:11 +00:00			`import json`
			`import os`
			`import re`
			`import traceback`
			`import uuid`
remove unused module help and refactor user 2021-02-09 13:05:19 +00:00			`from urllib.parse import urljoin`
v3 2021-02-01 15:13:11 +00:00
			`import aiohttp`
			`from bs4 import BeautifulSoup`

update database 2021-03-21 08:14:28 +00:00			`from config import Config`
bugfix 2021-04-03 14:29:11 +00:00			`from core.template import logger_info`
v3 2021-02-01 15:13:11 +00:00
improve import 2021-04-08 12:50:42 +00:00			`infobox_render = Config('infobox_render')`
v3 2021-02-01 15:13:11 +00:00

Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`async def get_infobox_pic(link, pagelink, headers):`
v3 2021-02-01 15:13:11 +00:00			`try:`
bugfix 2021-04-03 14:29:11 +00:00			`logger_info('Starting find infobox..')`
fix 2021-02-02 09:57:56 +00:00			`wlink = re.sub(r'api.php', '', link)`
bugfix 2021-02-01 17:20:45 +00:00			`link = re.sub(r'(?:w/\|)api.php', '', link)`
v3 2021-02-01 15:13:11 +00:00			`try:`
headers support 2021-03-21 05:00:17 +00:00			`async with aiohttp.ClientSession(headers=headers) as session:`
check extension 2021-03-01 12:08:58 +00:00			`async with session.get(pagelink, timeout=aiohttp.ClientTimeout(total=20)) as req:`
			`html = await req.read()`
v3 2021-02-01 15:13:11 +00:00			`except:`
baka 2021-02-01 16:26:00 +00:00			`traceback.print_exc()`
v3 2021-02-01 15:13:11 +00:00			`return False`
			`soup = BeautifulSoup(html, 'html.parser')`
			`pagename = uuid.uuid4()`
			`url = os.path.abspath(f'./cache/{pagename}.html')`
			`if os.path.exists(url):`
			`os.remove(url)`
bugfix 2021-04-03 14:29:11 +00:00			`logger_info('Downloaded raw.')`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='notaninfobox') # 我`
bugfix 2021-02-05 07:57:59 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='portable-infobox') # 找`
wikidoc support 2021-02-14 15:25:39 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='infobox') # 找`
wikidoc support 2021-02-14 15:25:39 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='tpl-infobox') # 找`
wikidoc support 2021-02-14 15:25:39 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='infoboxtable') # 找`
wikidoc support 2021-02-14 15:25:39 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='infotemplatebox') # 找`
Update getinfobox.py 2021-03-23 15:00:45 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='skin-infobox') # 找`
wikidoc support 2021-02-14 15:25:39 +00:00			`if find_infobox is None: # 找`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`find_infobox = soup.find(class_='wikitable songtable') # 找 (arcw)`
wikidoc support 2021-02-14 15:25:39 +00:00			`if find_infobox is None: # 找`
			`return False # 找你妈，不找了<-咱还是回家吧`
bugfix 2021-04-03 14:29:11 +00:00			`logger_info('Find infobox, start modding...')`
v3 2021-02-01 15:13:11 +00:00
			`if infobox_render is None:`
			`open_file = open(url, 'a', encoding='utf-8')`
			`else:`
			`html_list = []`

			`for x in soup.find_all(rel='stylesheet'):`
			`y = str(x.get('href'))`
bugfix 2021-02-02 11:40:13 +00:00			`z = urljoin(wlink, y)`
fix 2021-02-02 09:33:55 +00:00			`z = re.sub(';', '&', z)`
v3 2021-02-01 15:13:11 +00:00			`if infobox_render is None:`
			`open_file.write(f'<link href="{z}" rel="stylesheet"/>\n')`
			`else:`
			`html_list.append(f'<link href="{z}" rel="stylesheet"/>\n')`

bugfix 2021-02-02 11:40:13 +00:00			`def join_url(base, target):`
			`target = target.split(' ')`
			`targetlist = []`
			`for x in target:`
			`if x.find('/') != -1:`
			`x = urljoin(base, x)`
			`targetlist.append(x)`
			`else:`
			`targetlist.append(x)`
			`target = ' '.join(targetlist)`
			`return target`
fix 2021-02-02 09:33:55 +00:00
bugfix 2021-02-02 11:40:13 +00:00			`for x in find_infobox.find_all(['a', 'img', 'span']):`
			`if x.has_attr('href'):`
			`x.attrs['href'] = join_url(link, x.get('href'))`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`if x.has_attr('src'):`
bugfix 2021-02-02 11:40:13 +00:00			`x.attrs['src'] = join_url(link, x.get('src'))`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`if x.has_attr('srcset'):`
bugfix 2021-02-02 11:40:13 +00:00			`x.attrs['srcset'] = join_url(link, x.get('srcset'))`
Revert "🤡" This reverts commit 3ae933717123bfbcbe08e5ca7aa62cd611a28218. 2021-04-01 15:58:02 +00:00			`if x.has_attr('style'):`
bugfix 2021-02-02 11:40:13 +00:00			`x.attrs['style'] = re.sub(r'url\(/(.*)\)', 'url(' + link + '\\1)', x.get('style'))`
			`replace_link = find_infobox`
v3 2021-02-01 15:13:11 +00:00
			`if infobox_render is None:`
			`open_file.write(str(replace_link))`
			`open_file.close()`
			`else:`
			`html_list.append(str(replace_link))`
fix 2021-02-02 09:33:55 +00:00			`html = '\n'.join(html_list)`
			`html = {'content': html}`
improve import 2021-04-08 12:50:42 +00:00			`logger_info('Start rendering...')`
v3 2021-02-01 15:13:11 +00:00			`picname = os.path.abspath(f'./cache/{pagename}.jpg')`
			`if os.path.exists(picname):`
			`os.remove(picname)`
refactor loader & bugfix 2021-04-02 16:35:26 +00:00			`async with aiohttp.ClientSession() as session:`
			`async with session.post(infobox_render, headers={`
			`'Content-Type': 'application/json',`
			`}, data=json.dumps(html)) as resp:`
			`with open(picname, 'wb+') as jpg:`
			`jpg.write(await resp.read())`
v3 2021-02-01 15:13:11 +00:00			`return picname`
			`except Exception:`
			`traceback.print_exc()`
			`return False`