python爬虫电商京东_python爬虫爬取京东商品信息

最新推荐文章于 2023-12-11 14:24:55 发布

weixin_39771775

最新推荐文章于 2023-12-11 14:24:55 发布

阅读量242

点赞数

文章标签： python爬虫电商京东

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/5/7 11:17
# @Author : dddchongya
# @Site :
# @File : ComputerFromJD.py
# @Software: PyCharm

import json
import os

import requests
from bs4 import BeautifulSoup as bst

informationnumber = 0

# Seconds to wait for JD to answer before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10

# Browser-like User-Agent: tells the server which device/browser the request
# claims to come from.
_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}


def _parse_jsonp(text):
    """Decode a JSON or JSONP (``callback({...});``) response body.

    Only the outer ``callback(`` ... ``)`` wrapper is removed, so spaces,
    parentheses and semicolons inside the payload (e.g. in review text)
    survive intact.

    Raises:
        ValueError: if the remaining text is not valid JSON.
    """
    body = text.strip()
    if not body.startswith(("{", "[")):
        start = body.find("(")
        end = body.rfind(")")
        if start != -1 and end > start:
            body = body[start + 1:end]
    return json.loads(body)


def _write_ranking(filename, ranking):
    """Write each entry of *ranking* as ``str(entry)`` on its own line.

    The file is created in the current working directory. A plain ``'\\n'``
    is written and left to text-mode newline translation; writing ``'\\r\\n'``
    in text mode would come out as ``'\\r\\r\\n'`` on Windows.
    """
    path = os.path.join(os.getcwd(), filename)
    with open(path, 'w', encoding="utf-8") as f:
        for entry in ranking:
            f.write(str(entry))
            f.write('\n')


def GetComment(id):
    """Collect review data for one product from JD's comment endpoint.

    Pages 1-4 of the comment feed are requested (10 reviews per page). The
    hot-tag list and the summary counters are read from the first page only;
    review texts are gathered from every page.

    Args:
        id: product id, sent as the ``productId`` query parameter.

    Returns:
        dict with three keys:
            "commentnumber": dict of summary counters (commentCount,
                defaultGoodCount, goodCount, poorCount, generalCount,
                afterCountStr, showCount).
            "label": list of ``"name:count"`` strings, one per hot tag.
            "comments": list of review texts with newlines removed.

    Raises:
        ValueError: if a response body is not JSON/JSONP.
        KeyError: if an expected field is missing from the payload.
    """
    param = {
        'callback': 'fetchJSON_comment98',
        'productId': id,
        'score': 0,
        'sortType': 5,
        'page': 1,
        'pageSize': 10,
        'isShadowSku': 0,
        'rid': 0,
        'fold': 1,
    }
    url = "https://club.jd.com/comment/productPageComments.action"

    label = []
    comments = []
    commentnumber = {}
    for page in range(1, 5):
        param["page"] = page
        response = requests.get(url, params=param, headers=_HEADERS,
                                timeout=_REQUEST_TIMEOUT)
        jsondata = _parse_jsonp(response.text)

        if page == 1:
            # Tags only need to be fetched once.
            for tag in jsondata["hotCommentTagStatistics"]:
                label.append(tag["name"] + ":" + str(tag["count"]))
            # The summary counters also only need to be fetched once.
            summary = jsondata["productCommentSummary"]
            commentnumber["commentCount"] = summary["commentCount"]
            commentnumber["defaultGoodCount"] = summary["defaultGoodCount"]
            commentnumber["goodCount"] = summary["goodCount"]
            commentnumber["poorCount"] = summary["poorCount"]
            commentnumber["generalCount"] = summary["generalCount"]
            commentnumber["afterCountStr"] = summary["afterCount"]
            commentnumber["showCount"] = summary["showCount"]

        for entry in jsondata["comments"]:
            comments.append(entry["content"].replace("\n", ""))

    return {
        "commentnumber": commentnumber,
        "label": label,
        "comments": comments,
    }


def GetMoreInformation(id):
    """Download and parse the detail page of one product.

    Args:
        id: product id used to build ``https://item.jd.com/<id>.html``.

    Returns:
        The parsed page as a BeautifulSoup document. (The original parsed the
        page and then discarded it, implicitly returning None.)
    """
    url = "https://item.jd.com/" + str(id) + ".html"
    res = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    return bst(res.content, "html.parser")


def GetGoodResone(LsComputer):
    """Rank the hot tags of well-reviewed products and save the ranking.

    A product counts as well reviewed when
    ``(goodCount + defaultGoodCount) / commentCount`` exceeds 0.7. For those
    products the counts of equally named tags are summed, and the totals are
    written in ascending order to ``好评过七十的标签排行.txt`` in the current
    working directory.

    Args:
        LsComputer: list of product dicts; each needs
            ``["comments"]["commentnumber"]`` (with goodCount,
            defaultGoodCount, commentCount) and ``["comments"]["label"]``
            (list of ``"name:count"`` strings).
    """
    tag_lists = []
    for item in LsComputer:
        counts = item['comments']['commentnumber']
        total = float(counts['commentCount'])
        if total == 0:
            # No reviews: the ratio is undefined, so the product cannot
            # qualify (and must not raise ZeroDivisionError).
            continue
        if (counts['goodCount'] + counts['defaultGoodCount']) / total > 0.7:
            tag_lists.append(item['comments']["label"])

    labellist = {}
    for tags in tag_lists:
        for tag in tags:
            # Split on the last ':' so a tag name containing ':' still parses.
            name, _, count = tag.rpartition(":")
            labellist[name] = labellist.get(name, 0) + float(count)

    result = sorted(labellist.items(), key=lambda x: x[1], reverse=False)
    _write_ranking('好评过七十的标签排行.txt', result)


def GetMaxSalesShop(LsComputer):
    """Print and save per-shop revenue, sales-volume and average-price rankings.

    The review count (``commentCount``) stands in for units sold. For every
    product whose price string has at least 5 characters, the last three
    characters are dropped (e.g. ``"5999.00"`` -> ``"5999"``) and the rest is
    parsed as a number; products whose price cannot be parsed are ignored.

    Three ascending rankings are printed and written to the current working
    directory:
        销售额排行.txt   - revenue: sum of commentCount * price per shop
        销售量排行.txt   - sales volume: sum of commentCount per shop
        平均价格排行.txt - mean price per shop (shops with a usable price only)

    Args:
        LsComputer: list of product dicts with "ShopName", "Price" and
            ``["comments"]["commentnumber"]["commentCount"]``.
    """
    shopcount = {}      # shop -> revenue
    shopsalecount = {}  # shop -> sales volume
    shopprice = {}      # shop -> parsed prices, used for the average

    for item in LsComputer:
        shop = item["ShopName"]
        shopcount.setdefault(shop, 0)
        shopsalecount.setdefault(shop, 0)
        prices = shopprice.setdefault(shop, [])

        raw_price = item["Price"]
        if len(raw_price) < 5:
            continue
        cleaned = raw_price[0:-3].replace("\n", "").replace(" ", "").replace("\t", "")
        try:
            price = float(cleaned)
        except ValueError:
            # Not a number: treat it like a missing price.
            continue

        sold = item["comments"]["commentnumber"]["commentCount"]
        shopcount[shop] = shopcount[shop] + sold * price
        prices.append(price)
        # NOTE(review): the original's indentation was lost; sales volume is
        # assumed to be counted only for products with a usable price.
        shopsalecount[shop] = shopsalecount[shop] + sold

    shopprice2 = {}
    for shop, prices in shopprice.items():
        if prices:
            shopprice2[shop] = sum(prices) / len(prices)

    print()
    print()
    result = sorted(shopcount.items(), key=lambda x: x[1], reverse=False)
    print("销售额排行::")
    for entry in result:
        print(entry)
    _write_ranking('销售额排行.txt', result)

    print()
    print()
    # The original printed/saved the average prices under this label and the
    # sales volumes under the next one; each ranking now carries its own data.
    result = sorted(shopsalecount.items(), key=lambda x: x[1], reverse=False)
    print("销售量排行::")
    for entry in result:
        print(entry)
    _write_ranking('销售量排行.txt', result)

    print()
    print()
    result = sorted(shopprice2.items(), key=lambda x: x[1], reverse=False)
    print("平均价格排行::")
    for entry in result:
        print(entry)
    _write_ranking('平均价格排行.txt', result)


# Any JD search link can be used for the search URL that follows.

# Base search URL; the page number is appended per request.
url = 'https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%94%B5%E8%84%91&page='
# Spoofed request headers.
headers = {
    # Tells the server which device/browser the request claims to come from.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}

LsComputer = []


def _parse_listing_item(node):
    """Extract the listing fields of one search-result ``<li>`` element.

    Returns:
        ``(sku, record)``: the product id taken from the class of the price
        ``<strong>`` tag (with "J" and "_" removed), and a dict with the keys
        CustomUrl, ImgUrl, Price, Describe, ShopName and BusinessMode.

    Raises:
        AttributeError, TypeError, IndexError: if an expected child element
            or attribute is missing from the item.
    """
    img_box = node.find("div", {"class": "p-img"})
    custom_url = img_box.find("a").get("href")
    if "https:" not in str(custom_url):
        custom_url = "https:" + custom_url

    price_box = node.find("div", {"class": "p-price"})
    sku = price_box.find("strong").get("class")[0].replace("J", "").replace("_", "")

    img_url = "https:" + str(img_box.find("img").get("source-data-lazy-img"))

    # Read the tag's text instead of slicing str(tag), which only works for
    # an attribute-free "<i>...</i>".
    price_tag = price_box.find("i")
    price = price_tag.get_text() if price_tag is not None else ""

    describe = str(node.find("div", {"class": "p-name p-name-type-2"}).find("em").get_text())

    # The shop link can be absent (the original notes the first item may be
    # empty); ShopName is then left as None.
    shop_name = node.find("div", {"class": "p-shop"}).find("a")
    if shop_name is not None:
        shop_name = str(shop_name.get_text())

    # A shop may carry several descriptor icons.
    business_mode = [icon.get_text() for icon in node.find("div", {"class": "p-icons"}).find_all("i")]

    record = {
        "CustomUrl": custom_url,
        "ImgUrl": img_url,
        "Price": price,
        "Describe": describe,
        "ShopName": shop_name,
        "BusinessMode": business_mode,
    }
    return sku, record


# Two passes, as in the original: presell items first, then every item.
# The second selector also matches presell items, so ids already collected
# are skipped; otherwise they would be fetched and counted twice.
_seen_skus = set()
for _item_class in ("gl-item gl-item-presell", "gl-item"):
    for k in range(1, 10):
        # Build each page URL from the base. The original appended to `url`
        # itself, producing page=3, page=35, page=357, ...
        _page_url = url + str(k * 2 + 1)
        res = requests.get(_page_url, headers=headers, timeout=10)
        _page = bst(res.content, "html.parser")

        # The class filter is passed as a dict; the original used a set
        # literal ({"class", "..."}).
        for _node in _page.find_all("li", {"class": _item_class}):
            try:
                _sku, ComputerInformation = _parse_listing_item(_node)
            except (AttributeError, TypeError, IndexError) as exc:
                # The original notes that the first item of a page can be
                # formatted differently; skip it rather than abort the crawl.
                print("skipping malformed item:", exc)
                continue
            if _sku in _seen_skus:
                continue
            _seen_skus.add(_sku)

            # Fetch the review data for this product.
            ComputerInformation["comments"] = GetComment(_sku)
            LsComputer.append(ComputerInformation)

# Write the collected data to a file as a single JSON array. The original
# wrote the objects back to back with no separator, which is not valid JSON.
with open(os.path.join(os.getcwd(), 'json.txt'), 'w', encoding="utf-8") as f:
    json.dump(LsComputer, f, indent=4, ensure_ascii=False)

GetMaxSalesShop(LsComputer)

weixin_39771775

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python爬虫电商京东_python爬虫爬取京东商品信息

#!/usr/bin/env python#-*- coding: utf-8 -*-#@Time : 2020/5/7 11:17#@Author : dddchongya#@Site :#@File : ComputerFromJD.py#@Software: PyCharmimportrequestsfrom bs4 importBeautifulSoup as bsti...
复制链接

扫一扫