简单介绍一下背景,朋友的亲戚是开家具厂的,能低价拿到厂里的货,想开网店,需要一些商品的图片.
据朋友所说,有一个网店和他是一样的货源,于是对其网店商品图片进行爬取.在这里记录一下过程,也是想看看自己能不能做到一件事.
涉及的知识有:
- python
- 基于request的网络爬虫
- 基于selenium的网络爬虫
- 正则表达式的使用
将任务简单划分了一下:
首先要根据搜索结果获取第一页商品详情页并保存.
搜索结果页面处理.py
from Functiontest import *
# UA spoofing: headers that make the request look like a normal desktop browser.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58'
    ,'Cookie': 'xxxx自己的cookie'
    ,'Referer':'https://s.taobao.com'
}
# Target URL (placeholder for the actual search-results page).
url='搜索页面的URL'
page_code = PageCodegtter(url,headers)
# Persist the page source to disk.
# NOTE(review): the success message is printed before fp.write runs, so it
# appears even when the write has not happened yet.
with open("搜索商品信息.html",'w',encoding="utf-8") as fp:
    print("网页保存成功")
    fp.write(page_code)
# Parse the source for product names and their detail-page links; possibly
# only the detail links are needed.
#aim = '{"i2iTags":.*"title":"(.*)".*"detail_url":"(.*)".*,"shopName":"XXXXX店铺名"}'
aim = '{"i2iTags":.*"detail_url":"(.*)".*,"shopName":"XXXXX店铺名"}'
# Extract with a regular expression.
#addre = "搜索商品信息.html"
# Pre-processing to make the regex workable: even non-greedy .*? matched too
# many characters, so each product item is split onto its own line first.
# (\S*) could be used instead if needed — \S matches any non-whitespace char.
page_code = page_code.replace(',{"i2iTags"','\n,{"i2iTags"')
# NOTE(review): this reassignment makes the `aim` defined above dead code.
aim = '{"i2iTags":.*"detail_url":"(.*)?","view_price.*,"shopName":"XXXXX店铺名"}'
det_list = CodeParsebyRe(aim,page_code)
# Normalize (add the https: scheme to protocol-relative URLs) and decode:
# the page stores the URLs with unicode escape sequences.
det_list = ["https:"+ i if 'https:' not in i else i for i in det_list]
'''
对 i.encode('utf8').decode('unicode_escape')
我的理解是
由于现在是以str类型存储了一个以utf-8编码格式存储了unicode格式的字符串
我们首先以encode('utf-8')将其以utf-8格式解码存储为二进制raw bytes,再对对raw bytes进行
decode('unicode_escape')解unicode编码存储为str
'''
# (Block above: str holds utf-8 text containing \uXXXX escapes; encode to raw
# bytes, then decode('unicode_escape') to resolve the escapes back into a str.)
det_list = [ i.encode('utf8').decode('unicode_escape') for i in det_list]
# Tmall renders the page dynamically, so a plain requests fetch cannot see the
# data; fall back to selenium with a manual QR-code login to fetch the detail
# pages, then persist them.
headers["Cookie"] = '由于该店铺在天猫,所以需要修改Cookie'
DetailPagesKepper(WebPageSeleniumByList(det_list,headers))
在获得所有目标商品详情页后,对其进行解析,爬取商品数据.
详情页面处理.py
from Functiontest import *
'''
1.获取标题
2.获取视频
3.获取图片
4.获取详情页面
'''
def DetailDeal(max_size):
    '''
    Walk the previously saved detail pages in index order and download
    each product's assets (title directory, optional video, pictures).

    Pages were saved earlier as ./店铺商品信息/<index>.html, named by save
    order, so indices 0..max_size inclusive are processed.

    Parameters:
        max_size: highest saved page index (int).
    '''
    # The original shadowed the builtin `max` with a copy of max_size and
    # kept a dead `pagecode = ''` initializer; both removed.
    for index in range(max_size + 1):
        # Progress marker so a long run is easy to follow.
        print(index)
        # Read the saved page source back from disk.
        pagecode = CodeReader("./店铺商品信息/" + str(index) + '.html')
        # Extract title plus video/picture URLs.
        page_detail = DetailPageParse(pagecode)
        # One directory per product, named after the (list-valued) title.
        if not os.path.exists("./店铺商品信息/" + str(page_detail['title'])):
            os.mkdir("./店铺商品信息/" + str(page_detail['title']))
        # By observation every product has preview and detail pictures,
        # while a video is optional — so only the video branch may skip.
        if page_detail["prevideo_url"]:
            # Show the video address being fetched.
            print(page_detail["prevideo_url"])
            # There should be at most one preview video.
            if VideorRequest(page_detail["prevideo_url"][0], str(page_detail['title']) + "/prevideo"):
                print("视频下载成功")
        else:
            print("当前商品无视频")
        # Preview (gallery) pictures.
        print(page_detail["prepicture_url"])
        if PictureRequest(page_detail["prepicture_url"], str(page_detail['title']) + "/prepicture"):
            print("预览下载成功")
        # Detail-section pictures.
        print(page_detail["detailpicture_url"])
        if PictureRequest(page_detail["detailpicture_url"], str(page_detail['title']) + "/detailpicture"):
            print("详情下载成功")

DetailDeal(44)
print("爬取结束")
Functiontest.py(大量蹩脚英语注释警告)
from urllib import response
import requests
import re
import os
from time import sleep
#加载驱动
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
def PageCodegtter (url ,headers):
    '''
    Fetch a web page and return its source code.

    Parameters:
        url: address to request, e.g. "https://www.baidu.com".
        headers: request-header dict, e.g.
            {'User-Agent': 'XXXX', 'Cookie': 'XXX'}

    Returns:
        The page source as a str (requests' decoded response body).
    '''
    print("开始下载网页\n")
    # Use the Session as a context manager so its connection pool is
    # closed when done — the original created a Session per call and
    # never closed it, leaking connections.
    with requests.Session() as session:
        response = session.get(url=url, headers=headers)
        page_code = response.text
    print("网页下载成功\n")
    return page_code
#这里就看以看出对整体的把握不行,突然发现前面对页面源码获取居然写死了response.text,又因为不想函数有太多的参数,写死了请求头headers
def SendRequest(url, headers=None):
    '''
    Send a GET request to *url* and return the raw response.

    Parameters:
        url: address to request.
        headers: optional request-header dict. When omitted, the original
            hard-coded UA/Cookie/Referer headers are used, so existing
            callers keep their exact behavior; new callers may now supply
            their own headers instead (addresses the hard-coding noted in
            the comment above this function).

    Returns:
        The requests.Response object.
    '''
    if headers is None:
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58'
            ,'Cookie': 'XXXX'
            ,'Referer':'https://s.taobao.com'
        }
    return requests.get(url=url, headers=headers)
def VideorRequest(video_url,title):
    '''
    Download a video and save it as video.mp4 under ./店铺商品信息/<title>/.

    Parameters:
        video_url: direct URL of the video resource.
        title: sub-directory name (may be a slash-joined sub path).

    Returns:
        True once the file has been written.
    '''
    print("开始下载视频数据")
    # Fetch the raw video bytes.
    video_data = SendRequest(video_url).content
    # makedirs creates missing parent directories and, with exist_ok,
    # tolerates the directory already existing — the original
    # exists-check + os.mkdir was racy and failed on missing parents.
    os.makedirs("./店铺商品信息/" + title, exist_ok=True)
    with open("./店铺商品信息/" + title + "/video.mp4", "wb") as fp:
        fp.write(video_data)
    return True
def CodeParsebyRe (aim,code):
    '''
    Run the regular expression *aim* over *code* and collect every match.

    Parameters:
        aim: str holding the regular expression.
        code: str holding the source text to search.

    Returns:
        list of matches (group values when the pattern has one group),
        exactly as produced by re.findall.
    '''
    return re.findall(aim, code)
# Read page source back from a persisted HTML file.
def CodeReader(addre):
    '''
    Read a saved HTML file and return its contents.

    Parameters:
        addre: str path of the file to read.

    Returns:
        The whole file content decoded as UTF-8.
    '''
    with open(addre, "r", encoding="utf-8") as fp:
        return fp.read()
def DetailPageParse (pagecode):
    '''
    Parse a product detail page for its title and asset URLs.

    Parameters:
        pagecode: str holding the page source.

    Returns:
        dict of the form
            {'title': [...matches...] or '',
             'prevideo_url': [urls],
             'prepicture_url': [urls],
             'detailpicture_url': [urls]}
        NOTE: 'title' keeps the original quirk of being the re.findall
        list when found and '' when not — callers str() it either way.
    '''
    # Raw strings so the \S escapes are regex escapes, not (invalid)
    # Python string escapes (the original triggered escape warnings).
    aim_titile = r'<title>(.*)</title>'
    aim_prevideo = r'poster="" src="(.*)"></video>'
    # Trick: some image URLs are not directly present, but the preview
    # variant shares the same prefix, so strip the preview suffix.
    aim_prepicture = r'quality="normal" data-once="true" src="(\S*)_110x10000Q75.jpg_.webp"'
    aim_detailpicture = r'<img data-src="(\S*)" src="//g.alicdn.com/s.gif" data-name="singleImage" class="descV8-singleImage-image lazyload"'

    def _with_scheme(urls):
        # Prefix protocol-relative URLs with https:; pass others through.
        # (Shared helper replaces the three copies of this expression.)
        return ["https:" + u if 'https:' not in u else u for u in urls]

    # re.findall used directly; the CodeParsebyRe wrapper added nothing.
    title = re.findall(aim_titile, pagecode)
    if not title:
        title = ''
    return {'title': title
            , 'prevideo_url': _with_scheme(re.findall(aim_prevideo, pagecode))
            , 'prepicture_url': _with_scheme(re.findall(aim_prepicture, pagecode))
            , 'detailpicture_url': _with_scheme(re.findall(aim_detailpicture, pagecode))
            }
def PictureRequest(picture_url_lsit,title):
    '''
    Download every picture in *picture_url_lsit* and save them as
    0.jpg, 1.jpg, ... under ./店铺商品信息/<title>/.

    Parameters:
        picture_url_lsit: iterable of picture URLs. (Name kept — typo and
            all — for backward compatibility with keyword callers.)
        title: sub-directory name (may be a slash-joined sub path).

    Returns:
        True once all files have been written.
    '''
    # makedirs creates missing parents and tolerates an existing
    # directory, unlike the original exists-check + os.mkdir.
    os.makedirs("./店铺商品信息/" + title, exist_ok=True)
    # enumerate replaces the hand-rolled counter.
    for count, picture_url in enumerate(picture_url_lsit):
        picturecontent = SendRequest(picture_url).content
        with open("./店铺商品信息/" + title + "/" + str(count) + ".jpg", "wb") as fp:
            fp.write(picturecontent)
    return True
def WebPageRequestByList(url_list,headers):
    '''
    Request every page in *url_list* and collect the sources.

    Parameters:
        url_list: list of URLs to fetch.
        headers: request-header dict, e.g.
            {'User-Agent': 'XXXX', 'Cookie': 'XXX'}

    Returns:
        list of page sources, one str per URL, in input order.
    '''
    pages = [PageCodegtter(target, headers) for target in url_list]
    print("共下载了" + str(len(pages)) + "张网页")
    return pages
def DetailPagesKepper(response_list):
    '''
    Persist each page source in *response_list* to disk, in list order,
    as ./店铺商品信息/0.html, 1.html, ...

    Parameters:
        response_list: list of page-source strings.
    '''
    # Create the output directory on first use.
    if not os.path.exists("./店铺商品信息"):
        os.makedirs("./店铺商品信息")
        print("--- new folder... ---")
        print("--- OK ---")
    else:
        print("--- There is this folder! ---")
    # enumerate replaces the hand-rolled name counter.
    for namelist, page_code in enumerate(response_list):
        with open("./店铺商品信息/" + str(namelist) + ".html", 'w', encoding="utf-8") as fp:
            fp.write(page_code)
        print(str(namelist) + ".html保存成功\n")
def WebPageSeleniumByList(url_list,headers):
    '''
    Fetch a list of pages with a real browser (Selenium + Firefox), for
    pages whose content is rendered dynamically and thus invisible to a
    plain requests fetch.

    Parameters:
        url_list: list of URLs to fetch.
        headers: request-header dict; currently unused (the cookie
            injection is left commented out) but kept for interface
            compatibility with WebPageRequestByList.

    Returns:
        list of page sources, one str per URL, in input order.
    '''
    # Raw string: the path mixes / and \ separators; r'' keeps the exact
    # same bytes while silencing the invalid-escape warnings for \F, \g.
    s = Service(r"./selenium浏览器驱动\Firefox\geckodriver")
    bro = webdriver.Firefox(service=s)
    response_list = []
    # Cookie = headers['Cookie']
    # bro.add_cookie(Cookie)
    try:
        for url in url_list:
            # Drive the browser to the URL.
            bro.get(url=url)
            # Sleep so the page (and the manual QR-code login, when it is
            # needed) can finish before the source is captured; 40s is
            # generous on purpose.
            sleep(40)
            # page_source is the browser's current rendered page source.
            response_list.append(bro.page_source)
    finally:
        # Always shut the browser down — the original leaked the driver
        # process whenever a request raised.
        bro.quit()
    print("共下载了" + str(len(response_list)) + "张网页")
    return response_list
写得很丑,希望能对大家有一点点的参考价值,也有很多需要改进的部分,欢迎大家进行讨论.