代码如下:
import requests
from bs4 import BeautifulSoup
import os
import shutil
import pandas as pd
# 网页的地址
def fetchHotel(url):
    """Fetch *url* with a browser-like GET request and return the body text.

    Browser-style headers reduce the chance of trivial bot blocking.
    A timeout is set so a stalled connection cannot hang the whole crawl
    (without it, requests waits indefinitely).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0",
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    }
    r = requests.get(url, headers=headers, timeout=15)
    # The server may omit/mislabel the charset; apparent_encoding guesses
    # from the body so Chinese text is not mojibake'd.
    r.encoding = r.apparent_encoding
    return r.text
def getPageNum(html):
    """Return the total number of result pages parsed from the pager.

    Falls back to 1 when the pager is missing or too short. The original
    code called ``.find_all`` on the result of ``find`` without a None
    check, which raised "'NoneType' object is not iterable" whenever the
    site returned a throttled/empty page (reported problem 2).
    """
    pageNum = 1
    bsObj = BeautifulSoup(html, "html.parser")
    pager = bsObj.find("div", attrs={"id": "pager-container"})
    if pager is None:
        # Rate-limited or unexpected page: no pager markup at all.
        return pageNum
    pageList = pager.find_all("a")
    # The last <a> is the "next page" arrow; the one before it carries the
    # highest page number. Guard the index so a short list cannot raise.
    if len(pageList) >= 2:
        pageNum = pageList[-2].text
    return int(pageNum)
def parseHtml(html):
    """Parse one listing page and yield rows of attraction data.

    Each yield is ``[[name, level, area, address, introduction, price,
    relation]]`` — a one-row list suitable for ``saveCsvFile``.

    Fixes for reported problem 1: the original iterated the *children* of
    the result_list div, which include NavigableString whitespace nodes
    that have no ``.h3`` attribute, and it assumed ``intro``/``area`` tags
    always exist. When the site throttles (around pages 6/9), the markup
    is partial and those assumptions crash; a debugger breakpoint slows
    the requests down, which is why it "sometimes works" under a
    breakpoint. Every lookup is now None-guarded.
    """
    bsObj = BeautifulSoup(html, "html.parser")
    bookList = bsObj.find("div", attrs={"class": "result_list"})
    if bookList is None:
        # Throttled/empty page: nothing to yield.
        return
    # Select only the item <div>s, skipping text nodes between them.
    for book in bookList.find_all("div", attrs={"class": "sight_item"}):
        if book.h3 is None or book.h3.a is None:
            continue
        name = book.h3.a.text
        # 景点详情
        intro = book.find("div", attrs={"class": "sight_item_about"})
        # 景点等级
        level = intro.find("span", attrs={"class": "level"}) if intro else None
        level = level.text if level else "未知"
        area = book.find("span", attrs={"class": "area"})
        area = area.text if area else "未知"
        # 景点具体地址
        address = book.find("p", attrs={"class": "address"})
        address = address.span.text if address and address.span else "未知"
        # 景点介绍
        introduction = intro.find("div", attrs={"class": "intro"}) if intro else None
        introduction = introduction.text if introduction else ""
        # 景区价格
        price = book.find("span", attrs={"class": "sight_item_price"})
        price = price.text if price else "未知"
        # 推荐相关景点个数
        relation = book.find("span", attrs={"class": "relation_count"})
        relation = str(relation.text) if relation else "0"
        yield [[name, level, area, address, introduction, price, relation]]
def saveCsvFile(filename, content):
    """Append *content* (a list of row lists) to *filename* as CSV.

    Opens in append mode with no header/index so repeated calls build up
    one file; ``utf_8_sig`` writes a BOM so Excel opens the Chinese text
    correctly. The original re-imported pandas inside the function even
    though it is already imported at module level — removed.
    """
    dataframe = pd.DataFrame(content)
    dataframe.to_csv(filename, encoding='utf_8_sig', mode='a',
                     index=False, sep=',', header=False)
def downloadBookInfo(url, fileName):
    """Crawl every listing page reachable from *url* into CSV *fileName*.

    Writes the header row first, fetches the first page to learn the total
    page count, then fetches/parses each page and appends its rows.
    """
    head = [
        ['景点名', '景点等级', '地区', '景点具体地址', '景点介绍', '价格', '相关景点个数']]
    saveCsvFile(fileName, head)
    html = fetchHotel(url)
    pageNum = getPageNum(html)
    for page in range(1, pageNum + 1):
        print("正在爬取", str(page), "页 .......")
        # BUGFIX: the original URL contained "®ion=" — the "&reg" of
        # "&region=" was mangled into the ® character when the code was
        # pasted, so the query string was silently wrong.
        pageUrl = ("https://piao.qunar.com/ticket/list.htm?keyword=%E5%AE%89%E9%98%B3"
                   "&region=&from=mpl_search_suggest&page=" + str(page))
        html = fetchHotel(pageUrl)
        for book in parseHtml(html):
            saveCsvFile(fileName, book)
if __name__ == "__main__":
    # BUGFIX: "®ion=" in the original was "&region=" mangled via the
    # &reg HTML entity when pasted.
    url = ('https://piao.qunar.com/ticket/list.htm?keyword=%E5%AE%89%E9%98%B3'
           '&region=&from=mpl_search_suggest&page=1')
    fileName = "安阳景点.csv"
    # Problem 3: if a previous run left the CSV behind, delete it so that
    # append-mode writes in saveCsvFile do not duplicate rows; the file is
    # recreated automatically on the first write.
    if os.path.exists(fileName):
        os.remove(fileName)
    # (The original also issued an unused `requests.get(url)` here — a
    # wasted network round-trip — removed.)
    downloadBookInfo(url, fileName)
    print("全部完成!")
目前代码能够运行,但存在一些瑕疵:
1.数据爬取到第6页或者第9页的时候会出现这样的错误,但是如果在saveCsvFile(fileName, book)打上断点有概率正常运行,不知原因
2.偶尔会在def getPageNum(html)函数中报错:'NoneType' object is not iterable,不知原因
3.想实现检测文件夹中是否已存在"安阳景点.csv":若存在则先删除再爬取,若不存在则直接正常执行。该功能目前尚未实现。