2021-08-07 07:56:48 +00:00
|
|
|
|
import asyncio
|
2021-04-18 12:25:25 +00:00
|
|
|
|
import datetime
|
2020-06-13 12:43:43 +00:00
|
|
|
|
import re
|
2020-08-01 03:25:34 +00:00
|
|
|
|
import traceback
|
2021-04-17 15:17:29 +00:00
|
|
|
|
import urllib.parse
|
2020-08-12 16:01:34 +00:00
|
|
|
|
|
2020-09-05 09:51:43 +00:00
|
|
|
|
import aiohttp
|
2021-08-07 07:56:48 +00:00
|
|
|
|
import html2text
|
2021-09-10 18:05:27 +00:00
|
|
|
|
import ujson as json
|
2020-09-05 09:51:43 +00:00
|
|
|
|
|
2021-02-19 11:26:19 +00:00
|
|
|
|
from core import dirty_check
|
2021-08-24 05:11:39 +00:00
|
|
|
|
from core.logger import Logger
|
2021-07-28 18:51:24 +00:00
|
|
|
|
from .dbutils import WikiSiteInfo
|
|
|
|
|
|
2020-08-12 16:01:34 +00:00
|
|
|
|
|
2021-02-01 15:13:11 +00:00
|
|
|
|
class wikilib:
|
2021-04-25 14:10:03 +00:00
|
|
|
|
async def get_data(self, url: str, fmt: str, headers=None, ignore_err=False):
|
2021-08-20 10:52:39 +00:00
|
|
|
|
print(url)
|
2021-03-21 05:00:17 +00:00
|
|
|
|
async with aiohttp.ClientSession(headers=headers) as session:
|
2021-04-25 14:10:03 +00:00
|
|
|
|
async with session.get(url, timeout=aiohttp.ClientTimeout(total=20)) as req:
|
|
|
|
|
if req.status == 200 or ignore_err:
|
|
|
|
|
if fmt == 'json':
|
|
|
|
|
try:
|
|
|
|
|
return json.loads(await req.text())
|
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
|
traceback.print_exc()
|
|
|
|
|
return json.loads(await req.text(encoding='unicode-escape'))
|
2020-10-27 15:48:41 +00:00
|
|
|
|
else:
|
2021-04-25 14:10:03 +00:00
|
|
|
|
try:
|
|
|
|
|
return await req.text()
|
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
|
traceback.print_exc()
|
|
|
|
|
return await req.text(encoding='unicode-escape')
|
|
|
|
|
else:
|
|
|
|
|
raise ValueError(req.status)
|
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
def encode_query_string(self, kwargs: dict):
|
|
|
|
|
return '?' + urllib.parse.urlencode(kwargs)
|
|
|
|
|
|
2021-04-25 14:10:03 +00:00
|
|
|
|
    async def check_wiki_available(self, link, headers=None):
        """Probe *link* for a working MediaWiki API endpoint.

        Returns ``(api_url, sitename)`` on success, or
        ``(False, <error message>)`` on failure. The sitename gets a warning
        appended when the TextExtracts extension is missing.
        """
        query_string = {'action': 'query', 'meta': 'siteinfo',
                        'siprop': 'general|namespaces|namespacealiases|interwikimap|extensions', 'format': 'json'}
        query = self.encode_query_string(query_string)
        try:
            # fast path: the link already points straight at .../api.php
            api = re.match(r'(https?://.*?/api.php$)', link)
            wlink = api.group(1)
            # NOTE(review): get_data(..., 'json') already returns a parsed
            # object, so json.loads here looks redundant — confirm; any
            # failure is swallowed by the bare except below and falls through
            # to the EditURI-discovery path.
            json1 = json.loads(await self.get_data(api.group(1) + query, 'json', headers=headers))
            getcacheinfo = WikiSiteInfo(wlink).get()
            # cached siteinfo is reused when younger than 12 hours (43200 s)
            if getcacheinfo and datetime.datetime.now().timestamp() - getcacheinfo[1].timestamp() < 43200:
                return wlink, json.loads(getcacheinfo[0])['query']['general']['sitename']
        except:
            try:
                # slow path: fetch the page and discover api.php via the
                # <link rel="EditURI"> tag MediaWiki emits
                getpage = await self.get_data(link, 'text', headers=headers, ignore_err=True)
                print(getpage)
                if getpage.find('<title>Attention Required! | Cloudflare</title>') != -1:
                    return False, 'CloudFlare拦截了机器人的请求,请联系站点管理员解决此问题。'
                m = re.findall(
                    r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
                    getpage)
                api = m[0]
                if api.startswith('//'):
                    # protocol-relative EditURI: borrow the scheme from *link*
                    api = link.split('//')[0] + api
                Logger.info(api)
                getcacheinfo = WikiSiteInfo(api).get()
                if getcacheinfo and datetime.datetime.now().timestamp() - getcacheinfo[1].timestamp() < 43200:
                    return api, json.loads(getcacheinfo[0])['query']['general']['sitename']
                json1 = await self.get_data(api + query, 'json', headers=headers)
                wlink = api
            except TimeoutError:
                traceback.print_exc()
                return False, '错误:尝试建立连接超时。'
            except Exception as e:
                traceback.print_exc()
                # get_data raises ValueError(status); 403 means we were blocked
                if e.args == (403,):
                    return False, '服务器拒绝了机器人的请求。'
                elif not re.match(r'^(https?://).*', link):
                    return False, '所给的链接没有指明协议头(链接应以http://或https://开头)。'
                else:
                    return False, '此站点也许不是一个有效的Mediawiki:' + str(e)
        # endpoint confirmed: persist the siteinfo and build the display name
        WikiSiteInfo(wlink).update(json1)
        wikiname = json1['query']['general']['sitename']
        extensions = json1['query']['extensions']
        extlist = []
        for ext in extensions:
            extlist.append(ext['name'])
        if 'TextExtracts' not in extlist:
            wikiname = wikiname + '\n警告:此wiki没有启用TextExtracts扩展,返回的页面预览内容将为未处理的原始Wikitext文本。'
        return wlink, wikiname
|
|
|
|
|
|
2021-02-01 15:13:11 +00:00
|
|
|
|
def danger_wiki_check(self):
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('WIKIPEDIA') != -1:
|
2021-02-01 15:13:11 +00:00
|
|
|
|
return True
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('UNCYCLOPEDIA') != -1:
|
2021-02-02 11:40:13 +00:00
|
|
|
|
return True
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('HMOEGIRL') != -1:
|
2021-02-02 11:40:13 +00:00
|
|
|
|
return True
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('EVCHK') != -1:
|
2021-02-02 11:40:13 +00:00
|
|
|
|
return True
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('HONGKONG.FANDOM') != -1:
|
2021-02-02 11:40:13 +00:00
|
|
|
|
return True
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('WIKILEAKS') != -1:
|
2021-02-02 11:40:13 +00:00
|
|
|
|
return True
|
2021-04-20 16:32:53 +00:00
|
|
|
|
if self.wiki_api_endpoint.upper().find('NANFANGGONGYUAN') != -1:
|
2021-02-04 08:25:13 +00:00
|
|
|
|
return True
|
2021-02-01 15:13:11 +00:00
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
async def danger_text_check(self, text):
|
|
|
|
|
if not self.danger_wiki_check():
|
|
|
|
|
return False
|
2021-02-19 11:26:19 +00:00
|
|
|
|
check = await dirty_check.check(text)
|
2021-02-01 15:13:11 +00:00
|
|
|
|
print(check)
|
|
|
|
|
if check.find('<吃掉了>') != -1 or check.find('<全部吃掉了>') != -1:
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
2021-03-21 12:33:05 +00:00
|
|
|
|
async def random_page(self, url, iw=None, headers=None):
|
2021-04-20 16:32:53 +00:00
|
|
|
|
query_string = {'action': 'query',
|
|
|
|
|
'list': 'random',
|
|
|
|
|
'format': 'json'}
|
|
|
|
|
random_url = url + self.encode_query_string(query_string)
|
2021-03-21 12:33:05 +00:00
|
|
|
|
json = await self.get_data(random_url, 'json')
|
|
|
|
|
randompage = json['query']['random'][0]['title']
|
2021-03-21 12:41:07 +00:00
|
|
|
|
return await self.main(url, randompage, interwiki=iw, headers=headers)
|
2021-03-21 12:33:05 +00:00
|
|
|
|
|
2021-04-18 12:25:25 +00:00
|
|
|
|
    async def get_wiki_info(self, url=None):
        """Return the siteinfo payload (general/namespaces/namespacealiases/
        interwikimap/extensions) for *url*, defaulting to
        self.wiki_api_endpoint.

        Uses the WikiSiteInfo DB cache when the stored entry is younger than
        12 hours (43200 s); otherwise refetches and updates the cache.
        """
        url = url if url is not None else self.wiki_api_endpoint
        getcacheinfo = WikiSiteInfo(url).get()
        # cache entry is (raw_json, timestamp)
        if getcacheinfo and datetime.datetime.now().timestamp() - getcacheinfo[1].timestamp() < 43200:
            return json.loads(getcacheinfo[0])
        query_string = {'action': 'query', 'meta': 'siteinfo',
                        'siprop': 'general|namespaces|namespacealiases|interwikimap|extensions', 'format': 'json'}
        wiki_info_url = url + self.encode_query_string(query_string)
        j = await self.get_data(wiki_info_url, 'json')
        WikiSiteInfo(url).update(j)
        return j
|
|
|
|
|
|
2021-04-25 14:10:03 +00:00
|
|
|
|
async def get_interwiki(self, url=None, iw=None):
|
|
|
|
|
print(url)
|
2021-04-18 12:25:25 +00:00
|
|
|
|
if url is None:
|
|
|
|
|
json = self.wiki_info
|
|
|
|
|
else:
|
|
|
|
|
json = await self.get_wiki_info(url)
|
2021-02-01 15:13:11 +00:00
|
|
|
|
interwikimap = json['query']['interwikimap']
|
|
|
|
|
interwiki_dict = {}
|
2021-04-25 14:10:03 +00:00
|
|
|
|
if iw is None:
|
|
|
|
|
for interwiki in interwikimap:
|
2021-05-02 15:29:27 +00:00
|
|
|
|
interwiki_dict[interwiki['prefix']] = re.sub(r'\$1$', '', interwiki['url'])
|
2021-04-25 14:10:03 +00:00
|
|
|
|
else:
|
|
|
|
|
if iw in interwikimap:
|
|
|
|
|
interwiki_dict[iw] = interwikimap[iw]['url']
|
2021-02-01 15:13:11 +00:00
|
|
|
|
return interwiki_dict
|
|
|
|
|
|
2021-02-14 16:34:04 +00:00
|
|
|
|
async def get_namespace(self, url=None):
|
2021-04-18 12:25:25 +00:00
|
|
|
|
if url is None:
|
|
|
|
|
j = self.wiki_info
|
|
|
|
|
else:
|
|
|
|
|
j = await self.get_wiki_info(url)
|
2021-02-14 16:34:04 +00:00
|
|
|
|
d = {}
|
|
|
|
|
for x in j['query']['namespaces']:
|
|
|
|
|
try:
|
|
|
|
|
d[j['query']['namespaces'][x]['*']] = j['query']['namespaces'][x]['canonical']
|
2021-04-18 12:25:25 +00:00
|
|
|
|
except KeyError:
|
|
|
|
|
pass
|
2021-02-14 16:34:04 +00:00
|
|
|
|
except:
|
|
|
|
|
traceback.print_exc()
|
2021-04-02 16:35:26 +00:00
|
|
|
|
for x in j['query']['namespacealiases']:
|
|
|
|
|
try:
|
2021-04-02 16:46:37 +00:00
|
|
|
|
d[x['*']] = 'aliases'
|
2021-04-18 12:25:25 +00:00
|
|
|
|
except KeyError:
|
|
|
|
|
pass
|
2021-04-02 16:35:26 +00:00
|
|
|
|
except:
|
|
|
|
|
traceback.print_exc()
|
2021-02-14 16:34:04 +00:00
|
|
|
|
return d
|
|
|
|
|
|
2021-04-18 12:25:25 +00:00
|
|
|
|
async def get_article_path(self, url=None):
|
|
|
|
|
if url is None:
|
|
|
|
|
wiki_info = self.wiki_info
|
2021-04-20 16:32:53 +00:00
|
|
|
|
url = self.wiki_api_endpoint
|
2021-04-18 12:25:25 +00:00
|
|
|
|
else:
|
|
|
|
|
wiki_info = await self.get_wiki_info(url)
|
|
|
|
|
if not wiki_info:
|
2021-04-18 14:42:44 +00:00
|
|
|
|
return False
|
2021-04-18 12:25:25 +00:00
|
|
|
|
article_path = wiki_info['query']['general']['articlepath']
|
2021-05-02 15:29:27 +00:00
|
|
|
|
article_path = re.sub(r'\$1$', '', article_path)
|
2021-04-21 14:01:50 +00:00
|
|
|
|
print(url)
|
2021-04-20 16:32:53 +00:00
|
|
|
|
base_url = re.match(r'(https?://.*?)/.*', url)
|
|
|
|
|
return base_url.group(1) + article_path
|
2021-02-09 15:10:51 +00:00
|
|
|
|
|
2021-04-18 14:42:44 +00:00
|
|
|
|
async def get_enabled_extensions(self, url=None):
|
|
|
|
|
if url is None:
|
|
|
|
|
wiki_info = self.wiki_info
|
|
|
|
|
else:
|
|
|
|
|
wiki_info = await self.get_wiki_info(url)
|
|
|
|
|
extensions = wiki_info['query']['extensions']
|
2021-04-20 16:32:53 +00:00
|
|
|
|
ext_list = []
|
2021-04-18 14:42:44 +00:00
|
|
|
|
for ext in extensions:
|
2021-04-20 16:32:53 +00:00
|
|
|
|
ext_list.append(ext['name'])
|
|
|
|
|
return ext_list
|
2021-04-18 14:42:44 +00:00
|
|
|
|
|
2021-04-19 17:00:54 +00:00
|
|
|
|
async def get_real_address(self, url=None):
|
|
|
|
|
if url is None:
|
|
|
|
|
wiki_info = self.wiki_info
|
|
|
|
|
else:
|
|
|
|
|
wiki_info = await self.get_wiki_info(url)
|
2021-04-20 16:32:53 +00:00
|
|
|
|
real_url = wiki_info['query']['general']['server']
|
2021-04-21 14:04:40 +00:00
|
|
|
|
if real_url.startswith('//'):
|
|
|
|
|
real_url = self.wiki_api_endpoint.split('//')[0] + real_url
|
2021-04-20 16:32:53 +00:00
|
|
|
|
return real_url
|
2021-04-18 14:42:44 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def get_image(self, page_name, wiki_api_endpoint=None):
        """Resolve a 'File:...' page to the direct URL of its media file via
        prop=imageinfo.

        Returns the URL string, or False on any failure (errors are printed,
        never raised — callers treat a falsy result as 'no media').
        """
        try:
            query_string = {'action': 'query', 'titles': page_name, 'prop': 'imageinfo', 'iiprop': 'url',
                            'format': 'json'}
            url = (
                wiki_api_endpoint if wiki_api_endpoint is not None else self.wiki_api_endpoint) + self.encode_query_string(
                query_string)
            json_ = await self.get_data(url, 'json')
            parse_page_id = self.parse_page_id(json_)
            image_link = json_['query']['pages'][parse_page_id]['imageinfo'][0]['url']
            return image_link
        except:
            traceback.print_exc()
            return False
|
2020-10-27 15:48:41 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def get_page_link(self, page_name=None):
        """Query prop=info (inprop=url, following redirects) for *page_name*
        (defaults to self.page_name) and return the raw API response dict."""
        page_name = page_name if page_name is not None else self.page_name
        # strip a trailing '?...' suffix the user may have pasted with the title
        page_name = re.sub('(.*)\?.*$', '\\1', page_name)
        query_string = {'action': 'query', 'format': 'json', 'prop': 'info', 'inprop': 'url', 'redirects': 'True',
                        'titles': page_name}
        get_link_url = self.wiki_api_endpoint + self.encode_query_string(query_string)
        get_page = await self.get_data(get_link_url, "json")
        return get_page
|
2020-10-27 15:48:41 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
def parse_page_id(self, page_raw):
|
|
|
|
|
page_raw = page_raw['query']['pages']
|
|
|
|
|
page_list = iter(page_raw)
|
|
|
|
|
page_id = page_list.__next__()
|
|
|
|
|
return page_id
|
2020-10-27 15:48:41 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def research_page(self):
        """Fallback when the requested page does not exist: run list=search
        for the closest title and build a confirmation prompt.

        Returns ``{'status': 'wait', 'title': ..., 'text': ...}`` so the
        caller can ask the user, or ``{'status': 'done', ...}`` when nothing
        was found or the prompt was censored.
        """
        try:
            query_string = {'action': 'query', 'list': 'search', 'srsearch': self.page_name, 'srwhat': 'text',
                            'srlimit': '1', 'srenablerewrites': '', 'format': 'json'}
            search_url = self.wiki_api_endpoint + self.encode_query_string(query_string)
            get_sec_json = await self.get_data(search_url, "json", self.headers)
            sec_title = get_sec_json['query']['search'][0]['title']
            if self.interwiki == '':
                target = ''
            else:
                target = f'{self.interwiki}:'
            prompt = f'找不到{target}{self.page_name},您是否要找的是:[{target}{sec_title}]?'
            # a 'Foo:Bar' title with an unknown 'Foo' namespace is usually a
            # missing interwiki setting — hint at that in the prompt
            title_split = self.page_name.split(':')
            if len(title_split) > 1:
                try:
                    get_namespace = await self.get_namespace()
                    if title_split[0] not in get_namespace:
                        prompt += f'\n提示:此Wiki上找不到“{title_split[0]}”名字空间,请检查是否设置了对应的Interwiki(使用~wiki iw list命令可以查询当前已设置的Interwiki)。'
                except:
                    traceback.print_exc()
            # prepend the template-rollback notice set by step1(), if any
            if self.template_prompt:
                prompt = self.template_prompt + prompt
            if await self.danger_text_check(prompt):
                return {'status': 'done', 'text': 'https://wdf.ink/6OUp'}
            return {'status': 'wait', 'title': f'{target}{sec_title}', 'text': prompt}
        except Exception:
            traceback.print_exc()
            return {'status': 'done', 'text': '找不到条目。'}
|
2020-10-27 15:48:41 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def page_not_found(self):
        """Build the reply for a page the API reported as invalid or missing.

        Invalid titles get a translated error message; missing pages go
        through research_page(); anything else falls back to a bare article
        link built from the article path.
        """
        if 'invalid' in self.psepgraw:
            # translate the API's English reason into Chinese
            rs1 = re.sub('The requested page title contains invalid characters:', '请求的页面标题包含非法字符:',
                         self.psepgraw['invalidreason'])
            rs = '发生错误:“' + rs1 + '”。'
            # tidy up the punctuation around the quoted reason
            rs = re.sub('".”', '"”', rs)
            return {'status': 'done', 'text': rs}
        if 'missing' in self.psepgraw:
            self.rspt = await self.research_page()
            return self.rspt
        msg = await self.get_article_path(self.wiki_api_endpoint) + urllib.parse.quote(self.page_name.encode('UTF-8'))
        return {'status': 'done', 'text': msg}
|
2020-10-27 15:48:41 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def get_desc(self):
        """Fetch a short plain-text summary of self.query_text_name via the
        TextExtracts extension (exchars=200), trimmed at the first sentence
        boundary. Returns '' on any error or when the extract is only an
        ellipsis.
        """
        try:
            query_string = {'action': 'query', 'prop': 'info|pageprops|extracts',
                            'ppprop': 'description|displaytitle|disambiguation|infoboxes', 'explaintext': 'true',
                            'exsectionformat': 'plain', 'exchars': '200', 'format': 'json',
                            'titles': self.query_text_name}
            desc_url = self.wiki_api_endpoint + self.encode_query_string(query_string)
            load_text = await self.get_data(desc_url, "json", self.headers)
            page_id = self.parse_page_id(load_text)
            desc = load_text['query']['pages'][page_id]['extract'].split('\n')
            # drop blank lines, then rejoin
            desc_list = []
            for x in desc:
                if x != '':
                    desc_list.append(x)
            desc = '\n'.join(desc_list)
            # cut at the first sentence terminator (ASCII or CJK punctuation)
            desc_end = re.findall(r'(.*?(?:!\s|\?\s|\.\s|!|?|。)).*', desc, re.S | re.M)
            if desc_end:
                desc = desc_end[0]
        except Exception:
            traceback.print_exc()
            desc = ''
        # an extract of just '...' carries no information; treat as empty
        if desc in ['...', '…']:
            desc = ''
        return desc
|
2020-09-05 09:51:43 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def get_first_line(self):
        """Fallback summary for wikis without TextExtracts: render the page
        with action=parse, convert the HTML to plain text via html2text,
        de-duplicate lines, and cut at the first sentence boundary.
        Returns '' on any error.
        """
        try:
            query_string = {'action': 'parse', 'page': self.query_text_name, 'prop': 'text',
                            'format': 'json'}
            desc_url = self.wiki_api_endpoint + self.encode_query_string(query_string)
            load_desc = await self.get_data(desc_url, 'json', self.headers)
            # plain text only: strip links, images and tables from the output
            h = html2text.HTML2Text()
            h.ignore_links = True
            h.ignore_images = True
            h.ignore_tables = True
            desc_raw = h.handle(load_desc['parse']['text']['*']).split('\n')
            # keep non-empty lines, skipping duplicates (infoboxes often
            # repeat the lead sentence)
            desc_list = []
            for x in desc_raw:
                if x != '':
                    if x not in desc_list:
                        desc_list.append(x)
            desc_raw = '\n'.join(desc_list)
            # cut at the first sentence terminator (ASCII or CJK punctuation)
            cut_desc = re.findall(r'(.*?(?:!\s|\?\s|\.\s|!|?|。)).*', desc_raw, re.S | re.M)
            if cut_desc:
                desc = cut_desc[0]
            else:
                desc = desc_raw
        except Exception:
            traceback.print_exc()
            desc = ''
        return desc
|
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
async def get_all_wikitext(self):
|
2021-02-14 15:25:39 +00:00
|
|
|
|
try:
|
2021-04-20 16:32:53 +00:00
|
|
|
|
query_string = {'action': 'parse', 'page': self.query_text_name, 'prop': 'wikitext', 'format': 'json'}
|
|
|
|
|
desc_url = self.wiki_api_endpoint + self.encode_query_string(query_string)
|
|
|
|
|
load_desc = await self.get_data(desc_url, 'json', self.headers)
|
|
|
|
|
desc = load_desc['parse']['wikitext']['*']
|
2020-10-27 15:48:41 +00:00
|
|
|
|
except Exception:
|
2021-02-01 15:13:11 +00:00
|
|
|
|
traceback.print_exc()
|
2020-10-27 15:48:41 +00:00
|
|
|
|
desc = ''
|
|
|
|
|
return desc
|
|
|
|
|
|
|
|
|
|
    async def step1(self):
        """First resolution step: extract the page id from self.page_raw.

        A page id of '-1' means the page does not exist. For template
        lookups, strip the 'Template:' prefix, record a rollback notice in
        self.template_prompt, and retry once (self.template is cleared so the
        retry cannot loop); otherwise delegate to page_not_found(). Existing
        pages continue to step2().
        """
        try:
            self.page_id = self.parse_page_id(self.page_raw)
        except:
            return {'status': 'done', 'text': '发生错误:无法获取到页面,请检查是否设置了对应Interwiki。'}
        self.psepgraw = self.page_raw['query']['pages'][self.page_id]

        if self.page_id == '-1':
            if self.template == True:
                # template page missing: roll back to a plain page search
                self.page_name = self.orginpagename = re.sub(r'^Template:', '', self.page_name)
                self.template = False
                if self.interwiki == '':
                    target = ''
                else:
                    target = self.interwiki + ':'
                self.template_prompt = f'提示:[{target}Template:{self.page_name}]不存在,已自动回滚搜索页面。\n'
                return await self.step1()
            return await self.page_not_found()
        else:
            return await self.step2()
|
|
|
|
|
|
|
|
|
|
    async def step2(self):
        """Second resolution step for an existing page: build the final reply.

        Derives the display title from the page's full URL, fetches a summary
        (TextExtracts when available, otherwise the first parsed line),
        re-attaches '#section' / '?query' suffixes from the user's input,
        trims the text to 250 chars / 5 lines, attaches direct media links
        for File: pages, and dirty-checks the result before returning a
        ``{'status': 'done', ...}`` dict.
        """
        try:
            full_url = self.psepgraw['fullurl']
            # title as it appears after the article path, e.g. 'Foo/Bar'
            try:
                geturl_pagename = full_url.split(self.wiki_articlepath)
                geturl_pagename = geturl_pagename[1]
            except:
                geturl_pagename = full_url
            self.query_text_name = urllib.parse.unquote(geturl_pagename)
            query_text_name_split = self.query_text_name.split(':')
            if len(query_text_name_split) > 1:
                namespaces = await self.get_namespace()
                if query_text_name_split[0] in namespaces:
                    if namespaces[query_text_name_split[0]] == 'Template':
                        # template pages: summarize the /doc subpage instead,
                        # honoring a {{documentation|link=...}} override
                        get_all_text = await self.get_all_wikitext()
                        try:
                            match_doc = re.match(r'.*{{documentation\|?(.*?)}}.*', get_all_text, re.I | re.S)
                            match_link = re.match(r'link=(.*)', match_doc.group(1), re.I | re.S)
                            if match_link:
                                get_doc = match_link.group(1)
                                get_doc_raw = await self.get_page_link(get_doc)
                                get_doc_id = self.parse_page_id(get_doc_raw)
                                get_doc_link = get_doc_raw['query']['pages'][get_doc_id]['fullurl']
                                get_doc_pagename = get_doc_link.split(self.wiki_articlepath)[1]
                                self.query_text_name = get_doc_pagename
                            else:
                                self.query_text_name = geturl_pagename + '/doc'
                        except AttributeError:
                            # no {{documentation}} found: default to /doc
                            self.query_text_name = geturl_pagename + '/doc'
            enabled_extensions = await self.get_enabled_extensions()
            if 'TextExtracts' in enabled_extensions:
                desc = await self.get_desc()
            else:
                desc = await self.get_first_line()
            print(desc)
            fin_page_name = geturl_pagename
            # carry a '#section' anchor from the user's input into the link
            try:
                section = re.match(r'.*(\#.*)', self.page_name)
                if section:
                    fin_page_name = geturl_pagename + urllib.parse.quote(section.group(1).encode('UTF-8'))
                    full_url = self.psepgraw['fullurl'] + urllib.parse.quote(section.group(1).encode('UTF-8'))
            except Exception:
                traceback.print_exc()
            # likewise carry a '?...' query suffix
            try:
                pgstr = re.match(r'.*(\?.*)', self.page_name)
                if pgstr:
                    fin_page_name = geturl_pagename + pgstr.group(1)
                    full_url = full_url + pgstr.group(1)
            except Exception:
                traceback.print_exc()
            fin_page_name = urllib.parse.unquote(fin_page_name)
            fin_page_name = re.sub('_', ' ', fin_page_name)
            if fin_page_name == self.orginpagename:
                rmlstlb = re.sub('\n$', '', desc)
            else:
                # name changed during resolution: surface the redirect chain
                if self.interwiki == '':
                    target = ''
                else:
                    target = f'{self.interwiki}:'
                rmlstlb = re.sub('\n$', '',
                                 f'(重定向[{target}{self.orginpagename}] -> [{target}{fin_page_name}])' + (
                                     '\n' if desc != '' else '') + f'{desc}')
            rmlstlb = re.sub('\n\n', '\n', rmlstlb)
            # cap the reply at 250 characters...
            if len(rmlstlb) > 250:
                rmlstlb = rmlstlb[0:250] + '...'
            # ...and at 5 lines
            try:
                rm5lline = re.findall(r'.*\n.*\n.*\n.*\n.*\n', rmlstlb)
                result = rm5lline[0] + '...'
            except Exception:
                result = rmlstlb
            msgs = {'status': 'done', 'url': full_url, 'text': result, 'apilink': self.wiki_api_endpoint}
            # attach direct media links for File: pages
            match_img = re.match(r'File:.*?\.(?:png|gif|jpg|jpeg|webp|bmp|ico)', self.page_name, re.I)
            if match_img:
                getimg = await self.get_image(self.page_name)
                if getimg:
                    msgs['net_image'] = getimg
            match_aud = re.match(r'File:.*?\.(?:oga|ogg|flac|mp3|wav)', self.page_name, re.I)
            if match_aud:
                getaud = await self.get_image(self.page_name)
                if getaud:
                    msgs['net_audio'] = getaud
            if result != '' and await self.danger_text_check(result):
                return {'status': 'done', 'text': 'https://wdf.ink/6OUp'}
            return msgs
        except Exception as e:
            traceback.print_exc()
            return {'status': 'done', 'text': '发生错误:' + str(e) + '\n错误汇报地址:https://github.com/Teahouse-Studios/bot/issues/new?assignees=OasisAkari&labels=bug&template=5678.md&title='}
|
2021-02-01 15:13:11 +00:00
|
|
|
|
|
2021-04-20 16:32:53 +00:00
|
|
|
|
    async def main(self, api_endpoint_link, page_name, interwiki=None, template=False, headers=None, tryiw=0):
        """Entry point: resolve *page_name* on the wiki behind
        *api_endpoint_link* and return a result dict
        (``{'status': 'done'|'wait'|'warn', 'text': ..., ...}``).

        :param api_endpoint_link: URL of the wiki's api.php.
        :param page_name: page title; '' replies with the bare article path.
        :param interwiki: interwiki prefix chain accumulated across hops.
        :param template: look the name up in the Template namespace first.
        :param headers: optional HTTP headers.
        :param tryiw: current interwiki redirect depth (capped at 5).
        """
        print(api_endpoint_link)
        print(page_name)
        print(interwiki)
        try:
            if page_name == '':
                # no title given: reply with the wiki's article path
                article_path = await self.get_article_path(api_endpoint_link)
                if not article_path:
                    article_path = '发生错误:此站点或许不是有效的Mediawiki网站。' + api_endpoint_link
                return {'status': 'done', 'text': article_path}
            page_name = re.sub('_', ' ', page_name)
            page_name = page_name.split('|')[0]
            self.wiki_api_endpoint = api_endpoint_link
            # dirty-check the requested title itself on flagged wikis
            danger_check = self.danger_wiki_check()
            if danger_check:
                if await self.danger_text_check(page_name):
                    return {'status': 'done', 'text': 'https://wdf.ink/6OUp'}
            self.orginpagename = page_name
            self.page_name = page_name
            if interwiki == None:
                self.interwiki = ''
            else:
                self.interwiki = interwiki
            self.wiki_info = await self.get_wiki_info()
            self.wiki_namespace = await self.get_namespace()
            # rebuild the endpoint on the wiki's canonical server address
            real_wiki_url = await self.get_real_address()
            api_endpoint = re.match(r'^https?://.*?/(.*)', api_endpoint_link)
            self.wiki_api_endpoint = real_wiki_url + '/' + api_endpoint.group(1)
            self.wiki_articlepath = await self.get_article_path()
            self.template = template
            self.template_prompt = None
            self.headers = headers
            if self.template:
                if not re.match('^Template:', self.page_name, re.I):
                    self.page_name = 'Template:' + self.page_name
            self.page_raw = await self.get_page_link()
        except asyncio.exceptions.TimeoutError:
            return {'status': 'done', 'text': '发生错误:请求页面超时。\n错误汇报地址:https://github.com/Teahouse-Studios/bot/issues/new?assignees=OasisAkari&labels=bug&template=5678.md&title='}
        except Exception as e:
            return {'status': 'done', 'text': f'发生错误:{str(e)}\n错误汇报地址:https://github.com/Teahouse-Studios/bot/issues/new?assignees=OasisAkari&labels=bug&template=5678.md&title='}
        # the title resolved to another wiki: follow the interwiki link by
        # recursing into main() with the target wiki's endpoint
        if 'interwiki' in self.page_raw['query']:
            iwp = self.page_raw['query']['interwiki'][0]
            match_interwiki = re.match(r'^' + iwp['iw'] + r':(.*)', iwp['title'])
            if tryiw <= 5:
                iw_list = await self.get_interwiki(self.wiki_api_endpoint)
                interwiki_link = iw_list[iwp['iw']]
                check = await self.check_wiki_available(interwiki_link)
                if check[0]:
                    return await self.main(check[0], match_interwiki.group(1),
                                           ((interwiki + ':') if interwiki is not None else '') + iwp['iw'],
                                           self.template, headers, tryiw + 1)
                else:
                    return {'status': 'done',
                            'text': f'发生错误:指向的interwiki或许不是一个有效的MediaWiki。{interwiki_link}{match_interwiki.group(1)}'}
            else:
                return {'status': 'warn', 'text': '警告:尝试重定向已超过5次,继续尝试将有可能导致你被机器人加入黑名单。'}
        if 'redirects' in self.page_raw['query']:
            self.page_name = self.page_raw['query']['redirects'][0]['to']
        try:
            return await self.step1()
        except Exception as e:
            traceback.print_exc()
            return f'发生错误:{str(e)}' + '\n错误汇报地址:https://github.com/Teahouse-Studios/bot/issues/new?assignees=OasisAkari&labels=bug&template=5678.md&title=\n'
|