工作需要获取京东商品信息,自己研究了下,有不对的请指正
代码运行结果如图

1.写好整个爬虫框架
depth = 2      # how many search-result pages to crawl
skulist = []   # accumulated [shop, price, url, pic] rows

for i in range(depth):
    # NOTE(review): JD search pagination is often page=2*i+1; the original
    # passes the raw index — confirm this actually covers distinct pages.
    url = 'https://search.jd.com/Search?keyword=内裤&page=' + str(i)
    html = gethtml(url)   # prints an error and returns None on failure
    if html is None:
        continue          # skip pages we could not download
    try:
        # Parse rows into skulist in place (see step 3).
        getre(skulist, html)
    except IndexError:
        # A page where one regex misses an item desynchronizes the parallel
        # lists; skip that page rather than abort the whole crawl.
        # (The original's bare `except: continue` hid ALL errors, including
        # genuine bugs — this narrows it to the one expected failure.)
        continue

# Write everything collected to c:\list.xls (see step 4).
outputxls(skulist)
2.用requests库获取网页源代码
import requests
def gethtml(html):
    """Download a page and return its decoded source text.

    `html` is actually the URL to fetch (name kept for caller
    compatibility). Returns the page text on success, or None after
    printing an error message.
    """
    try:
        r = requests.get(html, timeout=10)
        # Bug fix: the original wrote `r.raise_for_status` without calling
        # it, so HTTP error codes (404/5xx) were never detected.
        r.raise_for_status()
        # Guess the real encoding from the body instead of the headers,
        # so Chinese text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
        # (The original had an unreachable print(r.text) after the return;
        # removed.)
    except requests.RequestException:
        print("源代码获取失败")
        return None
3.正则表达式获取需要的相关信息
import re
def getre(skulist, html):
    """Extract shop name, price, item URL and main image from JD search
    page source and append one [shop, price, url, pic] row per item to
    `skulist` (mutated in place; returns None).
    """
    # Four parallel lists, one regex per field.
    shop_list = re.findall('title=".*?">(.*?)</a></span>', html)
    price_list = re.findall('<em>¥</em><i>(.*?)</i>', html)
    url_list = re.findall(r'data-url="(.*?)"\s*data-presale', html)
    pic_list = re.findall('<img width="220" height="220" data-img="1" src="(.*?)" data-lazy-img', html)
    # Bug fix: the original indexed all four lists by range(len(shop_list)),
    # which raises IndexError whenever one pattern matches fewer items than
    # the shop pattern. zip stops at the shortest list instead.
    for shop, price, url, pic in zip(shop_list, price_list, url_list, pic_list):
        skulist.append([shop, price, url, pic])
4.把获取的信息写入excel中并保存到电脑
import os
import xlwt
def outputxls(put):
    """Write the scraped rows to c:\\list.xls, one spreadsheet row per item.

    `put` is a list of [shop, price, url, pic] rows. Creates a workbook
    with a single sheet named 'info' and saves it to the fixed path.
    """
    workbook_info = xlwt.Workbook()
    sheet_info = workbook_info.add_sheet('info')
    # enumerate gives row/column indices directly; the original also kept a
    # counter `m` that was incremented but never used — removed.
    for row, record in enumerate(put):
        for col, value in enumerate(record):
            sheet_info.write(row, col, label=value)
    workbook_info.save(r'c:\list.xls')
完整代码
import requests
import re
import os
import xlwt
def gethtml(html):
    """Download a page and return its decoded source text.

    `html` is actually the URL to fetch (name kept for caller
    compatibility). Returns the page text on success, or None after
    printing an error message.
    """
    try:
        r = requests.get(html, timeout=10)
        # Bug fix: the original wrote `r.raise_for_status` without calling
        # it, so HTTP error codes (404/5xx) were never detected.
        r.raise_for_status()
        # Guess the real encoding from the body instead of the headers,
        # so Chinese text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
        # (The original had an unreachable print(r.text) after the return;
        # removed.)
    except requests.RequestException:
        print("源代码获取失败")
        return None
def getre(skulist, html):
    """Extract shop name, price, item URL and main image from JD search
    page source and append one [shop, price, url, pic] row per item to
    `skulist` (mutated in place; returns None).
    """
    # Four parallel lists, one regex per field.
    shop_list = re.findall('title=".*?">(.*?)</a></span>', html)
    price_list = re.findall('<em>¥</em><i>(.*?)</i>', html)
    url_list = re.findall(r'data-url="(.*?)"\s*data-presale', html)
    pic_list = re.findall('<img width="220" height="220" data-img="1" src="(.*?)" data-lazy-img', html)
    # Bug fix: the original indexed all four lists by range(len(shop_list)),
    # which raises IndexError whenever one pattern matches fewer items than
    # the shop pattern. zip stops at the shortest list instead.
    for shop, price, url, pic in zip(shop_list, price_list, url_list, pic_list):
        skulist.append([shop, price, url, pic])
def outputxls(put):
    """Write the scraped rows to c:\\list.xls, one spreadsheet row per item.

    `put` is a list of [shop, price, url, pic] rows. Creates a workbook
    with a single sheet named 'info' and saves it to the fixed path.
    """
    workbook_info = xlwt.Workbook()
    sheet_info = workbook_info.add_sheet('info')
    # enumerate gives row/column indices directly; the original also kept a
    # counter `m` that was incremented but never used — removed.
    for row, record in enumerate(put):
        for col, value in enumerate(record):
            sheet_info.write(row, col, label=value)
    workbook_info.save(r'c:\list.xls')
depth = 2      # how many search-result pages to crawl
skulist = []   # accumulated [shop, price, url, pic] rows

for i in range(depth):
    # NOTE(review): JD search pagination is often page=2*i+1; the original
    # passes the raw index — confirm this actually covers distinct pages.
    url = 'https://search.jd.com/Search?keyword=内裤&page=' + str(i)
    html = gethtml(url)   # prints an error and returns None on failure
    if html is None:
        continue          # skip pages we could not download
    try:
        # Parse rows into skulist in place.
        getre(skulist, html)
    except IndexError:
        # A page where one regex misses an item desynchronizes the parallel
        # lists; skip that page rather than abort the whole crawl.
        # (The original's bare `except: continue` hid ALL errors, including
        # genuine bugs — this narrows it to the one expected failure.)
        continue

# Write everything collected to c:\list.xls.
outputxls(skulist)