淘宝商品比价定向爬虫
目标:获取淘宝搜索页面信息,提取其中的商品名称和价格
理解:淘宝的搜索接口,翻页处理
import requests
import re
def getHTMLText(url):
    """Fetch *url* and return the decoded HTML text, or "" on any request failure.

    A 30-second timeout guards against hanging connections; non-2xx status
    codes are converted to exceptions by raise_for_status().
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Guess the real encoding from the body instead of trusting the headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected here.
        return ""
def parsePage(ilt, html):
    """Extract goods from a Taobao search-result page and append them to *ilt*.

    Each match is appended as a ``[price, title]`` pair (both strings).

    Bug fix: the original computed ``price`` and ``title`` but never stored
    them, so *ilt* always stayed empty. ``eval()`` on scraped text is also
    replaced with regex capture groups, which is safe for untrusted input and
    does not truncate titles that contain a colon.
    """
    try:
        prices = re.findall(r'"view_price":"([\d.]*)"', html)
        titles = re.findall(r'"raw_title":"(.*?)"', html)
        # zip() stops at the shorter list, so mismatched counts cannot raise
        # the IndexError the original's range(len(plt)) loop was exposed to.
        for price, title in zip(prices, titles):
            ilt.append([price, title])
    except re.error:
        # Best-effort, matching the original's silent-failure behavior.
        print('')
def printGoodList(ilt):
    """Print the collected goods as an aligned table: index, price, title."""
    row_fmt = "{:4}\t{:8}\t{:16}"
    print(row_fmt.format('序号', '价格', '商品名称'))
    # enumerate replaces the original's manual counter variable.
    for index, goods in enumerate(ilt, start=1):
        print(row_fmt.format(index, goods[0], goods[1]))
if __name__ == "__main__":
    goods = '书包'   # search keyword
    depth = 2        # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # Taobao paginates with the 's' offset parameter, 44 items per page.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Narrowed from a bare except: skip a failed page and keep going.
            continue
    printGoodList(infoList)
淘宝禁止了爬虫对商品信息的获取,爬取时会自动跳转到登录页面,导致上面这段代码无法正常获取商品信息,目前我还不知道如何解决.
下面附带一个我从网上找的对淘宝上短裙商品信息爬取的代码,这个代码是绕过了登录页面,并且把商品信息存到了pymysql中,非常的方便.但是我还不太清楚他具体是如何实现的,先保存下来日后慢慢研究.
博文链接:https://blog.csdn.net/d1240673769/article/details/74620085
# 爬取taobao商品
import urllib.request
import pymysql
import re
# 打开网页,获取网页内容
def url_open(url):
    """Fetch *url* with a desktop-browser User-Agent and return the page text.

    Fix: builds a per-request ``urllib.request.Request`` instead of calling
    ``install_opener`` — the original mutated process-wide opener state on
    every call. Undecodable bytes are dropped ("ignore") rather than raising.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36"
                      " SE 2.X MetaSr 1.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager guarantees the HTTP response object is closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8", "ignore")
# 将数据存入mysql中
def data_Import(sql):
    """Execute *sql* against the local MySQL ``python`` database and commit.

    Fix: the connection is now closed in a ``finally`` block, so a failing
    query no longer leaks the connection (the original's ``close`` was
    unreachable after an exception).

    NOTE(review): callers build *sql* by %-formatting scraped text — this is
    SQL-injection-prone; prefer parameterized ``cursor.execute`` when the
    call sites are reworked.
    """
    conn = pymysql.connect(host='127.0.0.1', user='test', password='123456',
                           db='python', charset='utf8')
    try:
        conn.query(sql)
        conn.commit()
    finally:
        conn.close()
if __name__ == '__main__':
    try:
        # Search keyword, URL-encoded for the query string.
        keywd = "短裙"
        keywords = urllib.request.quote(keywd)
        # Number of result pages to crawl (44 items per page).
        num = 100
        # Field-extraction patterns, compiled ONCE here — the original
        # recompiled all nine patterns on every page iteration.
        img_pat = re.compile('"pic_url":"(//.*?)"')
        name_pat = re.compile('"raw_title":"(.*?)"')
        nick_pat = re.compile('"nick":"(.*?)"')
        price_pat = re.compile('"view_price":"(.*?)"')
        fee_pat = re.compile('"view_fee":"(.*?)"')
        sales_pat = re.compile('"view_sales":"(.*?)"')
        comment_pat = re.compile('"comment_count":"(.*?)"')
        city_pat = re.compile('"item_loc":"(.*?)"')
        detail_url_pat = re.compile('detail_url":"(.*?)"')
        for i in range(num):
            # 's' is the result offset: page i starts at item i * 44.
            url = ("https://s.taobao.com/search?q=" + keywords
                   + "&imgfile=&commend=all&ssid=s5-e&search_type=item"
                     "&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1"
                     "&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48&s="
                   + str(i * 44))
            data = url_open(url)
            # Collect every field's matches for this page.
            imgL = img_pat.findall(data)
            nameL = name_pat.findall(data)
            nickL = nick_pat.findall(data)
            priceL = price_pat.findall(data)
            feeL = fee_pat.findall(data)
            salesL = sales_pat.findall(data)
            commentL = comment_pat.findall(data)
            cityL = city_pat.findall(data)
            detail_urlL = detail_url_pat.findall(data)
            for j in range(len(imgL)):
                img = "http:" + imgL[j]      # goods image link (protocol-relative in source)
                name = nameL[j]              # goods title
                nick = nickL[j]              # shop name
                price = priceL[j]            # price
                fee = feeL[j]                # shipping fee
                sales = salesL[j]            # number of buyers
                detail_url = detail_urlL[j]  # goods detail link
                comment = commentL[j]        # comment count, may be empty
                if comment == "":
                    comment = 0
                city = cityL[j]              # shop city
                print('正在爬取第' + str(i) + "页,第" + str(j) + "个商品信息...")
                # NOTE(review): string-built SQL — scraped titles containing a
                # quote will break this statement, and it is injection-prone.
                # Parameterize via cursor.execute() when data_Import is reworked.
                sql = ("insert into taobao(name,price,fee,sales,comment,city,nick,img,detail_url)"
                       " values('%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                           name, price, fee, sales, comment, city, nick, img, detail_url))
                data_Import(sql)
                print("爬取完成,且数据已存入数据库")
    except Exception as e:
        print(str(e))
        print("任务完成")