效果:
单线程模式:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/30 18:56
# @Author : huni
# @File : 图集谷单函数.py
# @Software: PyCharm
import requests
from lxml import etree
from urllib import parse
import os
if __name__ == '__main__':
    # Browser-like User-Agent header sent with the requests made by this script.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }

    # Output directory (presumably where the downloads end up -- the code that
    # writes into it is not visible in this part of the file).
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and is a no-op when the directory already exists.
    m_path = './xxx'
    os.makedirs(m_path, exist_ok=True)

    # Search keyword, percent-encoded as UTF-8 so that non-ASCII text can be
    # placed safely in the URL path.
    kw = 'xxx'
    keyword = parse.quote(kw, encoding='utf-8')
    url = f'https://www.tujigu.com/search/{keyword}'

    # requests applies no timeout by default; without one a stalled server
    # would block the script forever.
    resp = requests.get(url=url, headers=headers, timeout=10).text
    tree = etree.HTML(resp)

    # <li> nodes of the search result page; the loop that follows reads the
    # link (./a/@href) from each of them.
    li_list = tree.xpath('/html/body/div[2]/div[2]/ul/li')
for li in li_list:
href = li.xpath('./a/@href')[0]
resp1 = requests.get(url=href,headers=headers)
# 处理中文乱码问题
page_text = resp1.text.encode('ISO-8859-1').decode('utf-8')
tree1 = etree.HTML(page_text)
page_num = int(tree1.xpath('//*[@id="pages"]//text()')[-3])
title = tree1.xpath("/html/head/title/text()"