# Crawl all images under a xiachufang.com search entry (下厨房某词条下的所有图片爬取)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:小二郎
# datetime:2021/4/22 10:38
# software: PyCharm
import requests
from lxml import etree
import time
import os
import re
class xiachufangsgspider:
    """Crawler for xiachufang.com recipe images.

    For a user-supplied search keyword, walks N pages of search results and
    saves, for each non-video recipe, its cover image plus every step image
    into a directory tree: <save_dir>/<YYYY-MM-DD>/<keyword>/第<page>页/<title>/.

    NOTE: the lowercase class name is kept for backward compatibility with
    existing callers.
    """

    # Characters not allowed in Windows file/directory names (plus '@');
    # raw string avoids the invalid '\:' escape the original literal had.
    _BAD_CHARS = r'[/\\:*?<>|@]'

    def __init__(self):
        # Search-result URL template, filled with (keyword, page number).
        self.url = 'https://www.xiachufang.com/search/?cat=1007&keyword={}&cat=1001&page={}'
        # Headers for HTML page requests (site host + recorded session cookie).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0',
            'Upgrade-Insecure-Requests': '1',
            'Host': 'www.xiachufang.com',
            'Cookie': 'bid=kQ82GDsK; gr_user_id=b6c4ae54-136a-45f2-a497-ad9131dff47a; __utma=177678124.392935892.1511776434.1511776434.1511776434.1; __utmz=177678124.1511776436.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_ecd4feb5c351cc02583045a5813b5142=1511776486,1511942546; gr_session_id_8187ff886f0929da=595c7891-62df-44e1-a71a-ba6f482a200b; Hm_lpvt_ecd4feb5c351cc02583045a5813b5142=1511942570',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        }
        # Minimal headers for image downloads from the CDN (the 'Host' header
        # above would be wrong there, so only a User-Agent is sent).
        self.headers1 = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'
        }

    @staticmethod
    def _ensure_dir(path):
        """Create *path* (and parents) if it does not exist, reporting which."""
        if not os.path.exists(path):
            print('文件夹不存在,现已经创建~~~')
            os.makedirs(path, exist_ok=True)
        else:
            print('~~~文件夹地址已经存在,无需创建')

    def _save_image(self, img_url, dest_path, display_name):
        """Download one image and write its bytes to *dest_path*."""
        img_data = requests.get(url=img_url, headers=self.headers1).content
        with open(dest_path, 'wb') as fp:
            fp.write(img_data)
        print(display_name, '下载成功')

    def _download_recipe(self, link, recipe_dir):
        """Fetch one recipe page and save its cover image and step images."""
        link_page = requests.get(url=link, headers=self.headers).text
        tree = etree.HTML(link_page)
        try:
            # Primary XPath for the cover image; the absolute path is brittle,
            # so fall back to a class-based selector if it misses.
            cover_url = tree.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div/div[1]/img/@src')[0]
        except Exception as e:
            print(e)
            cover_url = tree.xpath('//div[@class="block recipe-show"]/div[2]/img/@src')[0]
        big_img_name1 = '大图.jpg'
        self._save_image(cover_url, recipe_dir + '/' + big_img_name1, big_img_name1)

        # Step-by-step images live in <div class="steps"><ol><li>…
        for ol_li in tree.xpath('//div[@class="steps"]/ol/li'):
            srcs = ol_li.xpath('./img/@src')
            if not srcs:
                continue  # step without an image
            raw_name = ol_li.xpath('./img/@alt')[0].replace('\n', '').replace('\r', '') + '.jpg'
            safe_name = re.sub(self._BAD_CHARS, '_', str(raw_name))
            self._save_image(srcs[0], recipe_dir + '/' + safe_name, safe_name)

    def get_link(self):
        """Interactive entry point.

        Prompts for a save directory, a search keyword and a page count, then
        crawls each result page and downloads every recipe's images. No
        validation is done on the directory input (as the prompt warns).
        """
        print('规范格式为,例如:D:/mytest/', '----没做目录校验,一定要正确填写目录')
        file_dir = input('请输入文件保存地址:')
        dt = time.strftime("%Y-%m-%d", time.localtime())
        directory = file_dir + dt
        self._ensure_dir(directory)

        input_text = input('请输入要搜索的标签:')
        directory_input_text = directory + '/' + input_text
        self._ensure_dir(directory_input_text)

        num = int(input('请输入需要的页数:'))
        # Pages are 1-based; a separate loop variable avoids clobbering `num`.
        for page_no in range(1, num + 1):
            page_dir = directory_input_text + '/' + '第' + str(page_no) + '页'
            if not os.path.exists(page_dir):
                os.makedirs(page_dir, exist_ok=True)
            url = self.url.format(input_text, page_no)
            print(url)
            page_text = requests.get(url=url, headers=self.headers).text
            tree = etree.HTML(page_text)
            # Each search hit sits in an <li> under <ul class="list">.
            for li in tree.xpath('//ul[@class="list"]/li'):
                title = li.xpath('normalize-space(./div/div/p[1]/a/text())')
                title1 = re.sub(self._BAD_CHARS, '_', str(title))
                if '视频' in title1:
                    continue  # skip video recipes
                ws = page_dir + '/' + str(title1)
                if not os.path.exists(ws):
                    os.makedirs(ws, exist_ok=True)
                # hrefs start with '/', so strip it to avoid a double slash.
                href = li.xpath('./div/div/p[1]/a/@href')[0]
                link = 'https://www.xiachufang.com/' + href.lstrip('/')
                print(title, ' ', link + '\n')
                self._download_recipe(link, ws)
            # time.sleep(0.2)
if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive crawl.
    spider = xiachufangsgspider()
    spider.get_link()
# P.S.: personal-use code, not written very tidily. (个人使用代码,写的不是很规整。)