这里仅展示单线程版本的代码,多线程版本可以自行探索,不再过多赘述。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/31 12:02
# @Author : huni
# @File : xxx单函数.py
# @Software: PyCharm
import requests
from lxml import etree
from urllib import parse
import os
# Site root used to resolve the relative links found in the scraped pages.
BASE_URL = 'https://xchina.co/'
# Seconds to wait for any single HTTP request before giving up on it.
TIMEOUT = 15
# Characters that are not allowed in a file or directory name on Windows
# (the path separators are unsafe everywhere).
_FORBIDDEN = '\\/:*?"<>|'


def safe_name(name, fallback='untitled'):
    """Return *name* made safe for use as a single path component.

    Path separators, the characters Windows forbids and ASCII control
    characters are replaced by ``_``; surrounding whitespace and trailing
    dots/spaces are removed.  If nothing is left (for example for ``''`` or
    ``'..'``) *fallback* is returned, so the result can never be empty or
    point at a parent directory.
    """
    cleaned = ''.join(
        '_' if ch in _FORBIDDEN or ord(ch) < 32 else ch for ch in name
    ).strip().rstrip('. ')
    return cleaned or fallback


def fetch(url, headers, timeout=TIMEOUT):
    """GET *url* and return the response, or ``None`` if the request failed.

    A timeout is always applied and non-2xx answers count as failures, so a
    stalled connection or an error page is reported and skipped instead of
    hanging the run or being saved as if it were real content.
    """
    try:
        resp = requests.get(url=url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(url, '请求失败,已跳过:', exc)
        return None
    return resp


def fetch_tree(url, headers):
    """Fetch *url* and return its parsed lxml HTML tree, or ``None``."""
    resp = fetch(url, headers)
    if resp is None or not resp.text.strip():
        return None
    return etree.HTML(resp.text)


def download_gallery(gallery_url, m_path, headers):
    """Download every image linked from the paginated page *gallery_url*.

    The images are stored in a sub-directory of *m_path* named after the
    page's ``<title>``.  Pages or images that cannot be fetched are skipped.
    """
    tree = fetch_tree(gallery_url, headers)
    if tree is None:
        return

    titles = tree.xpath('/html/head/title/text()')
    # The title comes from the remote page, so sanitise it before using it
    # as a directory name.
    title_path = os.path.join(m_path, safe_name(titles[0] if titles else ''))
    os.makedirs(title_path, exist_ok=True)

    pager_texts = tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[1]//text()')
    try:
        # NOTE(review): the 4th text node from the end is presumably the last
        # page number of the pager -- confirm against the live markup.
        page_num = int(pager_texts[-4])
    except (IndexError, ValueError):
        # Pager missing or laid out differently: still try the first page.
        page_num = 1

    for j in range(1, page_num + 1):
        page_url = gallery_url.replace('.html', f'/{j}.html')
        page_tree = fetch_tree(page_url, headers)
        if page_tree is None:
            continue
        # Selecting @href directly skips <a> elements that have no href.
        for img_href in page_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[2]/a/@href'):
            src = parse.urljoin(BASE_URL, img_href)
            jpg_name = safe_name(src.split('/')[-1])
            img = fetch(src, headers)
            if img is None:
                continue
            with open(os.path.join(title_path, jpg_name), 'wb') as fp:
                fp.write(img.content)
            print(jpg_name, '下载完成')


if __name__ == '__main__':
    # Root directory for all downloads.
    m_path = './xxx'
    os.makedirs(m_path, exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    kw = '陆萱萱'
    keyword = parse.quote(kw, encoding='utf-8')
    url = f'https://xchina.co/search/keyword-{keyword}.html'

    search_tree = fetch_tree(url, headers)
    # De-duplicate the links from the search page; sorting makes the crawl
    # order reproducible between runs.
    href_part_list = [] if search_tree is None else sorted(
        set(search_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[1]/div[1]//@href'))
    )

    for href_part in href_part_list:
        # urljoin copes with hrefs with or without a leading slash (and with
        # absolute URLs), avoiding the "//" produced by plain concatenation.
        href = parse.urljoin(BASE_URL, href_part)
        tree1 = fetch_tree(href, headers)
        if tree1 is None:
            continue
        for div in tree1.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[2]/div'):
            links = div.xpath('./a[1]/@href')
            if not links:
                continue
            download_gallery(parse.urljoin(BASE_URL, links[0]), m_path, headers)
------写在后面:
大家如果觉得小编的代码有用,可以多多关注小编。
同时小编的公众号也开通了,大家可以关注一下,后续会进行粉丝回馈,大家一起学习 Python 吧!
打赏小编点这里哦