爬取网站表情包
注意:爬取前请先确认代码里的图片下载地址是否仍然可用。我上午和下午都能正常爬取,到了晚上图片下载地址就失效了,原因不明;把地址(选择器)改一下就好了。
直接上代码:
import re
import uuid # 这里使用uuid我是为了给图片命名
import requests
import parsel # 第三方模块 pip install parsel
import time
def change_title(title):
    """Replace characters that are illegal in Windows file names with '_'."""
    # Compile once per call; the character class covers / \ : * ? " < > |
    illegal = re.compile(r'[\/\\\:\*\?\"\<\>\|]')
    return illegal.sub('_', title)
# --- Single-threaded crawl ---------------------------------------------------
# Walk the first 100 listing pages, extract every meme's title and image URL,
# download each image into sourceimages\, and print the total elapsed seconds.
time1 = time.time()

# Disguise the script as a normal browser; some servers reject the default
# python-requests User-Agent.  Hoisted out of the loop — it never changes.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41',
}

for page in range(1, 101):
    url = f'https://www.fabiaoqing.com/biaoqing/lists/page/{page}.html'
    # response.text is the raw HTML the server sent; it can differ from the
    # browser's Elements panel, which shows the DOM after JavaScript ran.
    response = requests.get(url=url, headers=headers)
    # Wrap the HTML string in a Selector so CSS queries can be run on it.
    selector = parsel.Selector(response.text)
    # ::attr(title) pulls the attribute value; getall() returns a list.
    titles = selector.css('.ui div.tagbqppdiv a::attr(title)').getall()
    # BUG FIX: the original selector used a non-existent <sourceimages> tag.
    # The lazy-loaded image URL lives in the <img> tag's data-original attr
    # (this matches the selector used by the multi-threaded version below).
    imges = selector.css('.ui div.tagbqppdiv a img::attr(data-original)').getall()

    for title, img_url in zip(titles, imges):
        new_title = change_title(title)
        # File extension from the URL (kept for reference; the saved file is
        # always named with a uuid fragment and a .jpg suffix, as before).
        imagename = img_url.split('.')[-1]
        if len(new_title) > 200:
            new_title = new_title[:10]
        # NOTE: the original if/else branches were identical downloads, so
        # they are collapsed into this single download-and-save.
        imgcontent = requests.get(url=img_url, headers=headers).content
        with open('sourceimages\\A' + str(uuid.uuid4())[-12:] + '.jpg', mode='wb') as file:
            file.write(imgcontent)
        print('正在保存' + title)

time2 = time.time()
usetime = int(time2) - int(time1)
print(usetime)
需要改动的位置:就是上面代码中提取 data-original 属性的那一行 CSS 选择器,请根据实际页面结构调整。
上面的是单线程爬取,下面实现多线程:
import re
import uuid
import requests
import parsel # 第三方模块 pip install parsel
import time
import concurrent.futures
# Record the wall-clock start time so total runtime can be reported at the end.
time_1 = time.time()
def change_title(title):
    """Sanitize *title* for use as a file name: map / \\ : * ? " < > | to '_'."""
    forbidden = r'[\/\\\:\*\?\"\<\>\|]'
    # re.sub accepts the pattern string directly; explicit compile not needed.
    return re.sub(forbidden, '_', title)
def get_response(html_url):
    """GET *html_url* with browser-like headers and return the Response.

    The User-Agent disguises the script as an Edge/Chrome browser, and the
    Cookie is a session captured from a real visit to the site.
    """
    browser_headers = {
        'Cookie': '__gads=ID=3ca13a48b6bff448-22fb34f730c900a6:T=1622859030:RT=1622859030:S=ALNI_MZs1lrcf9UfFd-ZRwvhmD1T64Pk4A; UM_distinctid=179d9efa788149-0dee70f28f65b7-51361244-100200-179d9efa78b250; PHPSESSID=ou4kj27av969s6qa024d1d678k; BAIDU_SSP_lcr=https://cn.bing.com/; CNZZDATA1260546685=783497468-1622857221-https%253A%252F%252Fcn.bing.com%252F%7C1622905194',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41',
    }
    return requests.get(url=html_url, headers=browser_headers)
def save(title, name, img_url):
    """Download one image and write it into the sourceimages folder.

    title   -- sanitized meme title (used only for the progress message)
    name    -- file extension taken from the image URL (e.g. 'jpg', 'gif')
    img_url -- direct download URL of the image
    """
    imgcontent = get_response(img_url).content
    # A uuid fragment keeps file names unique even when titles collide.
    # FIX: honor the extension the caller extracted instead of hard-coding
    # '.jpg' — animated memes are .gif and would lose animation as .jpg
    # (the `name` parameter was previously accepted but ignored).
    with open('sourceimages\\A' + str(uuid.uuid4())[-12:] + '.' + name, mode='wb') as file:
        file.write(imgcontent)
    print('正在保存' + title)
def get_image_info(html_url):
    """Parse one listing page and pair every meme title with its image URL.

    Returns a zip of (title, data-original URL) tuples.
    """
    page_html = get_response(html_url).text
    sel = parsel.Selector(page_html)
    names = sel.css('.ui div.tagbqppdiv a::attr(title)').getall()
    # data-original holds the real (lazy-loaded) image address.
    urls = sel.css('.ui div.tagbqppdiv a img::attr(data-original)').getall()
    return zip(names, urls)
def main(html_url):
    """Crawl one listing page: fetch it, then download every meme on it."""
    for title, img_url in get_image_info(html_url):
        new_title = change_title(title)
        # Extension comes from the URL, e.g. '.../xxx.gif' -> 'gif'.
        imagename = img_url.split('.')[-1]
        # Keep long titles short so the progress output stays readable.
        if len(new_title) > 6:
            new_title = new_title[:4]
        # FIX: the original if/else branches were byte-identical calls to
        # save(), so the duplicate branch is collapsed into one call.
        save(new_title, imagename, img_url)
if __name__ == '__main__':
    # BUG FIX: the original `exe.submit(url, main(url))` called main(url)
    # *synchronously* in the main thread (so nothing ran concurrently) and
    # then submitted the URL string as if it were a callable.  The correct
    # form is submit(callable, *args).
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as exe:
        for page in range(1, 201):
            url = f'https://www.fabiaoqing.com/biaoqing/lists/page/{page}.html'
            exe.submit(main, url)
    # Leaving the `with` block implies shutdown(wait=True), so every queued
    # download finishes before the elapsed time is measured below.
    time_2 = time.time()
    use_time = int(time_2) - int(time_1)
    print(use_time)