个人笔记
# Walk the paginated listing and save every bulletin whose title matches.
# `k` (page counter), `params`, `headers`, `enum_base` and `info_base` are
# expected to be defined before this loop.

# Compiled once because the pattern does not change between pages.
# NOTE(review): `[感染的]*` is a character class (any run of those three
# characters), not an optional "感染的" group. It is kept byte-for-byte so the
# set of accepted titles does not change; confirm whether `(?:感染的)?` was
# intended.
title_pattern = re.compile(
    r'\d+年\d+月\d+日云南省新冠肺炎疫情情况|云南省新型冠状病毒[感染的]*肺炎疫情情况'
)

while k < 10:
    # Build the URL of listing page `k` and send it as the Referer as well.
    params['page'] = k
    enum_url = enum_base + '?' + urlencode(params)
    headers['Referer'] = enum_url
    content = get(enum_url, headers)
    html = etree.HTML(content)

    # Three parallel lists: link, title and publish-date text of each entry.
    href_list = html.xpath('//div[@class="theSimilar"]/ul/li/a/@href')
    title_list = html.xpath('//div[@class="theSimilar"]/ul/li/a/text()')
    publish_list = html.xpath('//div[@class="theSimilar"]/ul/li/span/text()')

    # A page without entries ends the crawl.
    if not href_list:
        break

    # The lists are consumed in lockstep, so a length mismatch means the
    # entries cannot be paired reliably; report it and stop.
    if not (len(href_list) == len(title_list) == len(publish_list)):
        print('长度不相等')
        break

    for raw_href, title, publish in zip(href_list, title_list, publish_list):
        # re.match semantics: the pattern must match at the start of the title.
        if not title_pattern.match(title):
            continue

        href = urljoin(info_base, raw_href)
        data_info = {
            'province': '云南',
            'title': title,
            # Drop the square brackets surrounding the publish text.
            'publish_time': publish.replace('[', '').replace(']', ''),
            'text': info_parse(href, headers),
        }
        print(data_info)
        save(data_info)
        # Pause after each detail-page fetch.
        time.sleep(1)

    k += 1
    # Pause between listing pages.
    time.sleep(1)