Archived
1
0
Fork 0

Use BeautifulSoup to parse the rendered page HTML when the wiki has no TextExtracts extension

This commit is contained in:
yzhh 2021-06-01 20:43:31 +08:00
parent 6c3150a4ba
commit ee1a7fb5c5

View file

@ -7,6 +7,7 @@ import urllib.parse
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from core import dirty_check
from .database import WikiDB
@ -303,11 +304,11 @@ class wikilib:
async def get_first_line(self):
try:
query_string = {'action': 'parse', 'page': self.query_text_name, 'prop': 'wikitext', 'section': '0',
query_string = {'action': 'parse', 'page': self.query_text_name, 'prop': 'text',
'format': 'json'}
desc_url = self.wiki_api_endpoint + self.encode_query_string(query_string)
load_desc = await self.get_data(desc_url, 'json', self.headers)
desc_raw = load_desc['parse']['wikitext']['*'].split('\n')
desc_raw = BeautifulSoup(load_desc['parse']['text']['*'], 'html.parser').get_text(separator='\n').split('\n')
desc_list = []
for x in desc_raw:
if x != '':