I. Import the required packages
import urllib.request
import re
import requests
import time
import json
import xlwt
from random import choice
import os
import socket
from lxml import etree
II. Requirements analysis
1. Open the Tmall homepage and search for 手机 (mobile phones) to get the search results page.
2. View the page source and search for 苹果 (Apple); it takes you straight to the block of brand links.
3. Clicking those links reveals a listing page for each phone brand. These links are the first class of information we want to scrape.
4. On each listing page we can see the product logo, product name, sales count, price, and so on, plus the link into each product's detail page. All of these are what we scrape next.
The same trick works here: open the source view and search for the sales count, which pins down the location more precisely. The other fields we need sit right around the sales figure. Once we have the detail-page link, we can open the detail page and see what's worth scraping there.
5. Personally, I think the most valuable things on the detail page are the reviews and the review photos. So press F12 to open the developer tools, click Network, and find the request that returns the review data, like this:
https://rate.tmall.com/list_detail_rate.htm?itemId=538869984042&spuId=382573494&sellerId=2616970884&order=3&
currentPage=1&append=0
&content=1&tagId=&posi=&picture=1&groupId=&ua=098%23E1hvwQvpvLhvUvCkvvvvvjiPR2z90jtURF
qZAjEUPmPO6jDvR2cZ6j1PPLSyAjnhRphvCvvvphmCvpvuARNjNjx4zYMNQ9FwibDug5%2B2195RQDIFG5K88d
%2Fb9cD3e48myECi97DtdphvmpmvGSX2vvmWbIwCvvpv9hCviQhvCvvvpZpCvpvVvUCvpvvvmphvLU2LJ5Ia%2
Bb8reEQaUExreCkKHkx%2F1WmK53hz8Z4ikC4AdX3l8PoxdX9OdegaQfV6R3pBOyKQD40OV8tYVVzheugcRoxL
Dwet%2B3oZfveEvpvVvpCmpYFyuphvmvvvpoNq4cp3Kphv8vvvphvvvvvvvvC2DpvvvJyvvhXVvvmCWvvvByOv
vUhwvvCVB9vv9BQPvpvhvv2MMsyCvvpvvhCv9phv2nM5WDQi7rMNzT2Qz2yCvvpvvhCvdphvmpmC6rN0vvvPR8
6Cvvyv98o3L9vvbbG%3D&itemPropertyId=&itemPropertyIndex=&userPropertyId=&
userPropertyIndex=&rateQuery=&location=&needFold=0&_ksTS=1541865686575_944
&callback=jsonp945
Yes, that's a long URL, but on closer inspection you don't need all of it; a few key parameters are enough. I tested this myself, and it can be trimmed down to:
https://rate.tmall.com/list_detail_rate.htm?itemId=538869984042&sellerId=2616970884&order=3&currentPage=1&append=0&content=1&picture=1
The parts we need to change are itemId, sellerId, and currentPage: the product ID, the seller ID, and the page number, respectively.
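To make the parameter roles concrete, here is a small sketch (the helper name is mine, not from this article) that fills in the three values that change between requests:

def build_rate_url(item_id, seller_id, page):
    # Hypothetical helper: only itemId, sellerId and currentPage vary
    return ('https://rate.tmall.com/list_detail_rate.htm'
            '?itemId={}&sellerId={}&order=3&currentPage={}'
            '&append=0&content=1&picture=1').format(item_id, seller_id, page)

# build_rate_url(538869984042, 2616970884, 1) reproduces the trimmed URL above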
6. Open the trimmed URL above and look at what comes back.
That's right: JSON. (If you're wondering why my view renders so nicely, it's a Chrome extension I use called JSON-Handle.)
And there it all is at a glance: review text, review time, follow-up review text and time, and review photos. Now for the code.
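For orientation, here is the rough shape of the part of the payload the code below consumes (field names match the code; the example values are illustrative and a real response carries many more fields):

# Rough shape of the response, after the jsonp945(...) wrapper is stripped:
sample = {
    'rateDetail': {
        'rateList': [
            {
                'rateContent': '...',                  # review text
                'rateDate': '2018-11-10 23:21:26',     # review time
                'pics': ['//img.alicdn.com/...jpg'],   # review photos, protocol-relative
                'appendComment': {                     # None when there is no follow-up review
                    'content': '...',                  # follow-up text
                    'commentTime': '...',              # follow-up time
                    'pics': ['//img.alicdn.com/...jpg'],
                },
            },
        ],
    },
}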
III. Code implementation
1. Pick a random User-Agent header
def get_user_header():
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
        'Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27',
        'Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13',
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7',
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    ]
    # Return a ('User-Agent', value) pair, ready for opener.addheaders below
    return ('User-Agent', choice(user_agents))
2. Fetch a page through a random proxy
def get_html_proxy_utf(url, lines):
    # Route the request through a random proxy from valid_ip.txt
    proxy = urllib.request.ProxyHandler({'http': choice(lines)})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [get_user_header()]
    html = opener.open(url).read().decode('utf-8', 'ignore')
    return html
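One gap worth closing here: the main function in section 5 also calls get_html_proxy_gbk, because Tmall's listing pages are GBK-encoded, but that function never appears in the article. A minimal sketch, assuming it mirrors the UTF-8 version above:

def get_html_proxy_gbk(url, lines):
    # Same as get_html_proxy_utf, but the listing pages are GBK-encoded
    proxy = urllib.request.ProxyHandler({'http': choice(lines)})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [get_user_header()]
    html = opener.open(url).read().decode('gbk', 'ignore')
    return html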
3. Parse the JSON out of the JSONP response
def getdata(html):
    # The endpoint returns JSONP, e.g. jsonp945({...}); keep only the {...} part.
    # re.S lets . match newlines in case the payload spans several lines.
    jsondata = re.search(r'\{.*\}', html, re.S).group()
    data = json.loads(jsondata)
    return data
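A quick hypothetical round-trip, just to show the wrapper being stripped:

resp = 'jsonp945({"rateDetail": {"rateList": []}})'
print(getdata(resp))  # -> {'rateDetail': {'rateList': []}}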
4. Extract the fields we need, download the photos, and write rows to the Excel sheet
def download(logo_name, item_name, itemId, sale_num, data, url, z, f, sheet1):
    # One folder per brand/product; doubled backslashes so Python doesn't
    # treat them as escape sequences
    path = 'E:\\TMALL\\%s\\%s\\' % (str(logo_name), str(item_name).replace('/', ''))
    if not os.path.exists(path):
        os.makedirs(path)
    rate_list = data['rateDetail']['rateList']
    for i in range(0, len(rate_list)):
        content = rate_list[i]['rateContent']    # review text
        creationtime = rate_list[i]['rateDate']  # review time
        if rate_list[i].get('pics'):
            for k in range(0, len(rate_list[i]['pics'])):
                pics = rate_list[i]['pics'][k]
                a = 'http:' + pics  # image URLs are protocol-relative
                # Buyer photos, named <page>_<review>_pic_<index>.jpg
                auto_down(a, path + str(z) + '_' + str(i + 1) + '_pic_' + str(k) + '.jpg')
        if rate_list[i]['appendComment'] is None:
            # No follow-up review: leave its fields empty
            appendcontent = ''
            appendtime = ''
        else:
            appendcontent = rate_list[i]['appendComment']['content']   # follow-up text
            appendtime = rate_list[i]['appendComment']['commentTime']  # follow-up time
            if rate_list[i]['appendComment']['pics']:  # any follow-up photos?
                for l in range(0, len(rate_list[i]['appendComment']['pics'])):
                    appendpics = rate_list[i]['appendComment']['pics'][l]
                    b = 'http:' + appendpics
                    if requests.get(b).status_code != 200:
                        continue  # skip images that no longer resolve
                    # Follow-up photos, named <page>_<review>_appendpic_<index>.jpg
                    auto_down(b, path + str(z) + '_' + str(i + 1) + '_appendpic_' + str(l) + '.jpg')
        write_to_excel(logo_name, item_name, itemId, sale_num, content, creationtime,
                       appendcontent, appendtime, url, z * 10 + i, f, sheet1)
        print('Review ' + str(i + 1) + ' on this page saved')
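download() leans on two helpers, auto_down and write_to_excel, that the article never shows. Minimal hypothetical sketches of what they plausibly do, assuming auto_down is an image downloader that retries once and write_to_excel writes one review per row under the header written in main():

def auto_down(url, filename):
    # Hypothetical sketch: save an image, retrying once, since flaky proxies
    # and timeouts are common here
    try:
        urllib.request.urlretrieve(url, filename)
    except Exception:
        try:
            urllib.request.urlretrieve(url, filename)
        except Exception:
            pass  # give up on this image and move on

def write_to_excel(logo_name, item_name, itemId, sale_num, content,
                   creationtime, appendcontent, appendtime, url, row, f, sheet1):
    # Hypothetical sketch: one review per row, columns matching the header
    # row0 written in main()
    values = [logo_name, item_name, itemId, sale_num, content,
              creationtime, appendcontent, appendtime, url]
    for col in range(len(values)):
        sheet1.write(row + 1, col, values[col])  # +1 skips the header row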
5. The main function
def main():
    # Load the proxy pool, one ip:port per line (read as text, not bytes,
    # so ProxyHandler gets clean strings)
    ip_file = open('valid_ip.txt', 'r')
    lines = [line.strip() for line in ip_file if line.strip()]
    ip_file.close()
    # Brand-grouped search results for 手机 (mobile phones); q= is GBK percent-encoded
    url1 = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.4.38322a68rQnDFe&' \
           'q=%CA%D6%BB%FA&sort=d&style=g&search_condition=2&sarea_code=310100&from=sn_1_brand-qp&shopType=any#J_Filter'
    try:
        pagecode = get_html_proxy_gbk(url1, lines)
    except Exception:
        pagecode = get_html_proxy_gbk(url1, lines)  # retry once with another random proxy
    html = etree.HTML(pagecode)
    logo_url = html.xpath('//a[@data-f="spu-brand-qp"]/@href')
    logo_name = html.xpath('//a[@data-f="spu-brand-qp"]/img/@alt')
    for k in range(7, len(logo_url)):  # skip the first few brands
        url = 'https://list.tmall.com/search_product.htm' + logo_url[k]
        try:
            pagecode = get_html_proxy_gbk(url, lines)
        except Exception:
            pagecode = get_html_proxy_gbk(url, lines)
        html = etree.HTML(pagecode)
        url_list = html.xpath('//div[@class="productTitle productTitle-spu"]/a[1]/@href')
        item_name = html.xpath('//div[@class="productTitle productTitle-spu"]/a[1]/text()')
        sale_num = html.xpath('//p[@class="productStatus"]/span/em/text()')
        for i in range(0, len(url_list)):
            # 'id=(.*?)&' matches both 'id=...' and 'user_id=...' in the item URL
            itemId = re.findall(r'id=(.*?)&', url_list[i])[0]
            sellerId = re.findall(r'id=(.*?)&', url_list[i])[1]
            if os.path.exists('%s.xls' % str(item_name[i]).replace('/', '')):
                continue  # already crawled this item
            f = xlwt.Workbook()
            sheet1 = f.add_sheet('%s' % (str(itemId)), cell_overwrite_ok=True)
            row0 = ['Brand', 'Product name', 'Item ID', 'Sales', 'Review text',
                    'Review time', 'Follow-up text', 'Follow-up time', 'URL']
            # Write the header row
            for q in range(0, len(row0)):
                sheet1.write(0, q, row0[q])
            print('Crawling item ' + str(i) + ': ' + str(item_name[i]))
            for j in range(1, 100):
                url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=' + str(itemId) + \
                      '&sellerId=' + str(sellerId) + '&order=3&currentPage=' + str(j) + \
                      '&append=0&content=1&picture=1'
                try:
                    data = getdata(get_html_proxy_utf(url, lines))
                except Exception:
                    try:
                        data = getdata(get_html_proxy_utf(url, lines))
                    except Exception:
                        data = getdata(get_html_proxy_utf(url, lines))
                # sale_num is indexed per item (i), not per brand (k)
                download(logo_name[k], item_name[i], itemId, sale_num[i], data, url, j - 1, f, sheet1)
                print('Page ' + str(j) + ' done')
                time.sleep(1)
            # xlwt workbooks are written out with save(), not close(); saving here
            # also makes the os.path.exists() check above meaningful
            f.save('%s.xls' % str(item_name[i]).replace('/', ''))

if __name__ == '__main__':
    main()
OVER
All of the code is above. I hope it helps you in your own learning!