import requests
import re
import json
import time
from bs4 import BeautifulSoup
# Request headers sent with every HTTP call in this script; the User-Agent
# is that of a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
}
# Collect the product numbers listed on one search result page
def get_bianhao(url):
    """Return the product numbers found on one search result page.

    Downloads ``url``, takes the ``href`` of every product image link inside
    ``#J_goodsList`` and extracts the digits from links of the form
    ``//item.jd.com/<digits>.html``.

    Args:
        url: Address of the search result page to download.

    Returns:
        A list of product numbers as strings, in page order. Links without
        an ``href`` or that do not match the item URL pattern are skipped.
    """
    res = requests.get(url, headers=headers, timeout=20)
    # Decode the body with the encoding detected from its content.
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')
    links = soup.select('#J_goodsList > ul > li > div > div.p-img > a')
    bianhaos = []
    for link in links:
        # .get() avoids a KeyError on an anchor that has no href attribute.
        href = link.get('href') or ''
        # Raw string with escaped dots: only literal "item.jd.com" and
        # ".html" match, and "\d" is not an invalid string escape.
        match = re.search(r'//item\.jd\.com/(\d+)\.html', href)
        if match:
            bianhaos.append(match.group(1))
    return bianhaos
# Fetch a laptop's title and brand
# Shared result list: every fetch function appends single-key dicts to it.
total = []


def get_shangpin(url):
    """Append the title and brand scraped from a product page to ``total``.

    Two entries are always appended, ``{'标题': ...}`` then ``{'品牌': ...}``.
    If either element cannot be found on the page, both values are recorded
    as the placeholder ``'空'``.

    Args:
        url: Address of the product detail page to download.

    Returns:
        The module-level ``total`` list, after appending.
    """
    res = requests.get(url, headers=headers, timeout=20)
    # Decode the body with the encoding detected from its content.
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.select('body > div > div > div.itemInfo-wrap > div.sku-name')
    pinpai = soup.select('#parameter-brand > li > a')
    try:
        title1 = title[0].text.strip()
        pinpai1 = pinpai[0].text.strip()
    except IndexError:
        # A selector matched nothing; fall back to placeholders for both
        # fields rather than catching every exception with a bare except.
        title1 = pinpai1 = '空'
    total.append({'标题': title1})
    total.append({'品牌': pinpai1})
    return total
# Fetch a laptop's price
def get_jiage(url):
    """Append the price returned by the JSONP price endpoint to ``total``.

    The response is presumably of the form ``callback([{..., "p": ...}]);``
    (verify against the live endpoint). The payload between the outermost
    parentheses is parsed as JSON and its ``'p'`` field is recorded.

    Exactly one ``{'价格': ...}`` entry is appended. If the response cannot
    be parsed or has no ``'p'`` field, the placeholder ``'空'`` is recorded,
    mirroring ``get_shangpin``.

    Args:
        url: Address of the price endpoint, including the callback name.

    Returns:
        The module-level ``total`` list, after appending.
    """
    rj = requests.get(url, headers=headers, timeout=20)
    body = rj.text
    # Slice out the JSONP payload explicitly. str.lstrip/rstrip remove any
    # run of the given characters, not a prefix/suffix, so they are not a
    # reliable way to unwrap the callback.
    start = body.find('(')
    end = body.rfind(')')
    try:
        jdj = json.loads(body[start + 1:end])
        if isinstance(jdj, list):
            # The payload is wrapped in an array; use its first element.
            jdj = jdj[0]
        jiage = jdj['p']
    except (ValueError, LookupError, TypeError):
        # Malformed JSON, empty array, or a payload without a price.
        jiage = '空'
    total.append({'价格': jiage})
    return total
# Fetch a laptop's comment count
def get_pinglun(url):
    """Append the comment-count strings from the summary endpoint to ``total``.

    The response body is parsed as JSON (after removing any surrounding
    single quotes). One ``{'评论': ...}`` entry is appended for every item in
    its ``'CommentsCount'`` list, using that item's ``'CommentCountStr'``.

    Args:
        url: Address of the comment summary endpoint.

    Returns:
        The module-level ``total`` list, after appending.
    """
    response = requests.get(url, headers=headers, timeout=20)
    raw_text = response.text.strip('\'')
    summary = json.loads(raw_text)
    for entry in summary['CommentsCount']:
        record = {'评论': entry['CommentCountStr']}
        total.append(record)
    return total
# Search page link -> product numbers -> detail, price and comment-count URLs
if __name__ == '__main__':
    import pandas as pd

    # One dict per product, merged from the entries its fetch calls append.
    rows = []
    # The page parameter takes the odd values 1, 3, ..., 19.
    for i in range(1, 11):
        URL = 'https://search.jd.com/search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V01&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page=' + str(2 * i - 1)
        # "sku_id" instead of "id" so the builtin is not shadowed.
        for sku_id in get_bianhao(URL):
            print(sku_id)
            # Remember where this product's entries start in the shared list.
            start = len(total)
            URLS = 'https://item.jd.com/' + sku_id + '.html'
            get_shangpin(URLS)
            URLJ = 'https://p.3.cn/prices/mgets?callback=jQuery2224728&type=1&area=1_72_2799_0&pdtk=&pduid=401650232&pdpin=&pin=null&pdbp=0&skuIds=J_' + sku_id + '%2C'
            get_jiage(URLJ)
            URLP = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + sku_id
            get_pinglun(URLP)
            # Merge this product's single-key dicts into one row. This keeps
            # the columns aligned even when a field is missing, unlike
            # dropping NaNs per column and stacking the remainders.
            row = {}
            for entry in total[start:]:
                row.update(entry)
            rows.append(row)
            # Short pause between products to throttle the requests.
            time.sleep(0.5)
    print(total)
    df1 = pd.DataFrame(rows, columns=['品牌', '价格', '评论', '标题'])
    # NOTE(review): writing '.xls' needs an Excel writer engine for that
    # format; confirm the installed pandas still supports it, otherwise
    # switch the extension to '.xlsx'.
    df1.to_excel('笔记本.xls')
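# Article title: scraping JD.com laptop titles, brands, prices and comment counts with Python
# Page footer: "latest recommended article published 2024-05-10 12:20:45"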