代码如下:目前
import requests
from bs4 import BeautifulSoup
import os
import shutil
import pandas as pd
# 网页的地址
def fetchHotel(url):
# 请求头 浏览器类型
headers = {
"User-Agent" :"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0",
'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'}
# 发起网络请求,请求方式是 get
r = requests.get(url, headers=headers)
r.encoding = r.apparent_encoding
# html = r.text
# print(r.text)
return r.text
def getPageNum(html):
# 获取总页数
pageNum = 1
bsObj = BeautifulSoup(html,"html.parser")
# print(bsObj)
pageList = bsObj.find("div", attrs={"id": "pager-container"}).find_all("a")
if pageList:
pageNum = pageList[-2].text
# print(pageNum)
return int(pageNum)
def parseHtml(html):
# 解析html网页,提取数据
bsObj = BeautifulSoup(html, "html.parser")
bookList = bsObj.find("div", attrs={"class": "result_list"})
books = []
for book in bookList:
# 单个景点介绍
# single = book.find("div", attrs={"class": "result_list"})
name = book.h3.a.text
# 景点详情
intro = book.find("div",attrs={"class" : "sight_item_about"})
# 推荐相关景点
rely = book.find("div",attrs = {"class" : "clrfix"})
# 景点价格
# salary = book.find("span", attrs = {"class" : "sight_item_price"})
# name = single.find("a", attrs={"class": "name"}).text
level = intro.find('span', attrs = {"class": "level"})
if level:
level = level.text
if level is None:
level = "未知"
area = book.find("span", attrs={"class": "area"}).text
address = book.find("p", attrs={"class": "address"})
if address:
address = address.span.text
if address is None:
address = "未知"
introduction = intro.find('div',attrs={"class": "intro"}).text
if introduction:
introduction = introduction
if introduction is None:
introduction = ""
# 景区价格
price = book.find("span", attrs = {"class" : "sight_item_price"})
if price:
price = price.text
if price is None:
price = "未知"
relation = book.find('span',attrs={'class' : 'relation_count'})
if relation:
relation = str(relation.text)
if relation is None:
relation = "0"
books = [
[name, level, area, address, introduction, price, relation]]
yield books
def saveCsvFile(filename, content):
import pandas as pd
# 保存文件
dataframe = pd.DataFrame(content)
dataframe.to_csv(filename, encoding='utf_8_sig', mode='a', index=False, sep=',', header=False)
def downloadBookInfo(url, fileName):
head = [
['景点名','景点等级','地区','景点具体地址','景点介绍','价格','相关景点个数']]
saveCsvFile(fileName, head)
html = fetchHotel(url)
pageNum = getPageNum(html)
for page in range(1, pageNum + 1):
print("正在爬取", str(page), "页 .......")
url = "https://piao.qunar.com/ticket/list.htm?keyword=%E5%AE%89%E9%98%B3®ion=&from=mpl_search_suggest&page=" + str(page)
html = fetchHotel(url)
for book in parseHtml(html):
saveCsvFile(fileName, book)
url = 'https://piao.qunar.com/ticket/list.htm?keyword=%E5%AE%89%E9%98%B3®ion=&from=mpl_search_suggest&page=1'
r = requests.get(url)
fileName = "安阳景点.csv"
# # 判断安阳景点.csv是否存在,存在则删除,不存在不进行操作,等到程序执行到saveCsvFile()时自动创建
# if os.path.exists(r'data'):
# shutil.rmtree(r'data')
downloadBookInfo(url, fileName)
print("全部完成!")
目前代码能够运行,但存在一些瑕疵:
1.数据爬取到第6页或者第9页的时候会出现这样的错误,但是如果在saveCsvFile(fileName, book)打上断点有概率正常运行,不知原因
2.偶尔会在def getPageNum(html)函数中报错:'NoneType' object is not iterable,不知原因
3.想实现检测文件夹中是否已存在"安阳景点.csv",若存在则删除,若不存在则正常执行。没有实现