# python爬虫实战1 (Python web-scraping practice #1)
import requests
import pandas as pd
from bs4 import BeautifulSoup
import datetime
from itertools import groupby
import math
def _cell_text(item, css_class):
    """Return the stripped text of the first <span class=css_class> inside
    *item*, or 'N/A' when the span is missing.

    Replaces five copy-pasted bare ``except:`` blocks; ``.text`` on a
    ``None`` result of ``find`` raises ``AttributeError``, so only that
    is caught.
    """
    try:
        return item.find('span', {'class': css_class}).text.strip()
    except AttributeError:
        return 'N/A'


if __name__ == '__main__':
    # Scrape a paginated price-quotation listing and dump it to an Excel file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.1.5162 SLBChan/111',
        'Accept': '*/*',
        'Host': '',  # NOTE(review): empty Host header looks intentional but verify the target site accepts it
        'Connection': 'keep-alive'
    }
    pathStr = "网址"    # base URL (placeholder — fill in the real site)
    paramStr = '参数'   # page-parameter prefix (placeholder)
    # BUG FIX: the original line was missing the closing bracket:
    #   pathList = [paramStr+str(1)
    pathList = [paramStr + str(1)]  # page 1 is requested first to read the total count

    res = requests.get(pathStr + pathList[0], headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Column accumulators for the final DataFrame.
    time = []     # 时间 (timestamp)
    place = []    # 产地 (origin)
    price = []    # 价格 (price)
    product = []  # 品种 (variety)
    lifting = []  # 升降 (rise/fall)

    # Total record count, e.g. "共123条" -> ['共', '123', '条']; the digits
    # are at index 1 after grouping characters by isdigit().
    total = soup.find('span', 'eye-pagination__total').text
    ret = [''.join(g) for k, g in groupby(total, key=lambda x: x.isdigit())]
    num = math.ceil(int(ret[1]) / 15)  # 15 records per page
    print(num)

    # Build the request paths for pages 2..num (page 1 is already in the list).
    for i in range(2, num + 1):
        newPath = paramStr + str(i)
        print(newPath)
        pathList.append(newPath)

    # Fetch every page and harvest the listing rows.
    for pathi in pathList:
        res = requests.get(pathStr + pathi + '/', headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        data_list = soup.find_all('li', 'market-list-item')
        for data in data_list:
            time.append(_cell_text(data, 'time'))
            place.append(_cell_text(data, 'place'))
            price.append(_cell_text(data, 'price'))
            product.append(_cell_text(data, 'product'))
            lifting.append(_cell_text(data, 'lifting'))

    df = pd.DataFrame({'时间': time,
                       '产地': place,
                       '品种': product,
                       '价格': price,
                       '升降': lifting
                       })
    print(df)
    df.to_excel(datetime.date.today().strftime('%Y%m%d') + "内蒙古地区牛价格数据信息.xlsx")
# 该爬虫代码仅用于个人学习,不做商业开发 (for personal study only; no commercial use)