#!/usr/bin/python
# coding: utf-8
"""Scrape a WeChat index article and save each linked post as an HTML file.

The index page lists posts as ``<p>`` paragraphs whose text starts with a
three-digit serial number (e.g. ``001Some title``).  A single paragraph may
pack several numbered titles; those are split apart and re-paired, in order,
with the paragraph's ``<a href>`` links.
"""
import re
import time
from collections import Counter

# NOTE: ``requests`` and ``bs4`` are imported lazily inside the functions
# that need them, so the pure text helpers stay importable (and testable)
# without the third-party scraping dependencies installed.


def count_zero(text):
    """Return the number of ``'0'`` characters in *text* if there is more
    than one, else ``False``.

    Callers rely on the truthiness of the result to decide whether *text*
    packs several ``0NN``-numbered titles into one paragraph.
    """
    zeros = Counter(text)['0']  # Counter returns 0 for a missing key
    return zeros if zeros > 1 else False


def get_normal_title(text):
    """Split a multi-title paragraph into individually numbered titles.

    *text* looks like ``"001Title one?002Title two"``: the first three
    digits give the starting serial number and each further three-digit run
    begins the next title.  Returns a list of titles, each re-prefixed with
    its own zero-padded three-digit serial number.  ``'?'`` characters are
    replaced by spaces (they are unsafe in the filenames built later).
    """
    start_index = int(text[:3])
    # The original passed ``flags=1`` (re.TEMPLATE) by accident; the pattern
    # contains no repetition operators, so plain splitting is identical.
    fragments = re.split(r'\d\d\d', text, maxsplit=20)
    cleaned = [fragment.replace('?', ' ') for fragment in fragments]
    titles = [t for t in cleaned if t]  # drop the empty leading fragment
    # zfill(3) replaces the old three-branch padding logic and also fixes
    # it: the old code emitted four digits (e.g. '0010') once the running
    # serial crossed a digit boundary.
    return [str(start_index + offset).zfill(3) + title
            for offset, title in enumerate(titles)]


def eliminate_question(title):
    """Strip non-breaking spaces (``\\xa0``) that WeChat litters into titles."""
    return str(title).replace('\xa0', '')


def get_title_url(response):
    """Map cleaned article titles to their URLs from the index page HTML.

    Parameters
    ----------
    response : str
        Raw HTML of the WeChat index article.

    Returns
    -------
    dict
        ``{title: url}``; on duplicate titles the first occurrence wins
        (``setdefault`` semantics, as in the original).
    """
    from bs4 import BeautifulSoup  # third-party; imported lazily (see module note)

    title_url_dict = {}
    soup = BeautifulSoup(response, 'html.parser')
    starts_with_serial = re.compile(r'^\d{3}.*$')  # hoisted out of the loop
    for each_p in soup.find_all('p'):
        text = each_p.get_text()
        if not starts_with_serial.match(text):
            continue
        if count_zero(text):
            # Several numbered titles packed into one paragraph: split them
            # and pair each with the paragraph's links, in document order.
            titles = get_normal_title(text)
            urls = [a.get('href') for a in each_p.find_all('a')]
            for title, url in zip((eliminate_question(t) for t in titles), urls):
                title_url_dict.setdefault(title, url)
        else:
            anchor = each_p.find('a')
            if anchor is None:
                # Paragraph matched the serial pattern but carries no link;
                # the old code crashed here with AttributeError.
                continue
            title_url_dict.setdefault(eliminate_question(text), anchor.get('href'))
    return title_url_dict


def download_content(url, title):
    """Fetch *url* and write its HTML body to ``<title>.html`` (UTF-8)."""
    import requests  # third-party; imported lazily (see module note)

    html = requests.get(url=url).text
    with open(title + '.html', 'w', encoding='utf-8') as f:
        f.write(html)


def main():
    """Download every article listed on the hard-coded WeChat index page."""
    import requests  # third-party; imported lazily (see module note)

    url_wechat_index = 'https://mp.weixin.qq.com/s/7o8QxGydMTUe4Q7Tz46Diw'
    response = requests.get(url=url_wechat_index).text
    title_url_dict = get_title_url(response)
    for title, url in title_url_dict.items():
        time.sleep(5)  # be polite: throttle requests to the server
        download_content(url, title)


if __name__ == "__main__":
    main()