【Python】Qunar travel attraction data crawler

Modules required by the crawler: BeautifulSoup, requests

Target site: Qunar - https://travel.qunar.com/place/
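
Note: besides BeautifulSoup and requests, the scripts below also use pandas (to save results as CSV) and the lxml parser; assuming a standard Python environment, pip install beautifulsoup4 requests pandas lxml covers all of them.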

1. Crawl city ID links

Example: https://travel.qunar.com/p-cs300148-haikou

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import requests

def crawer_travel_static_url(url):
    # Fetch a page and return it as a parsed BeautifulSoup object
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    soup = BeautifulSoup(content, 'lxml')
    return soup

def crawer_travel_city_id():
    # Collect every destination name and its URL from the Qunar place index
    url = 'http://travel.qunar.com/place/'
    soup = crawer_travel_static_url(url)
    cat_url = []
    cat_name = []
    sub_list = soup.find_all('div', attrs={'class': 'sub_list'})

    for i in range(len(sub_list)):
        a_attr = sub_list[i].find_all('a')
        for j in range(len(a_attr)):
            cat_name.append(a_attr[j].text)
            cat_url.append(a_attr[j].attrs['href'])
    return cat_name, cat_url

city_name_list, city_url_list = crawer_travel_city_id()
city = pd.DataFrame({'city_name': city_name_list, 'city_code': city_url_list})
city.to_csv('travel_city.csv', encoding='utf_8_sig', index=False)

2. Crawl attraction ID links within a city

Example: https://travel.qunar.com/p-oi5740424-qiloulaojie
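
The digits in an attraction URL like this one (5740424 here) are the attraction's POI id; step 3 below strips that id out of the URL and passes it to Qunar's comment API.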

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import requests

def crawer_travel_url_content(url):
    # Fetch a page and return it as a parsed BeautifulSoup object
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj

def crawer_travel_attraction_url(url):
    # Total number of attractions listed for this city
    maxnum = crawer_travel_url_content(url + '-jingdian').find('p', {'class': 'nav_result'}).find('span').text
    # Keep only the digits
    maxnum = int(''.join([x for x in maxnum if x.isdigit()]))

    url = url + '-jingdian-1-'
    cat_url = []
    cat_name = []

    # Take the top 10 attractions here: 10 per page, page numbering starts at 1
    page = 2
    # Shrink the page count if it would go past the total number of attractions
    if (page - 1) * 10 > maxnum:
        page = int(((maxnum + 10) / 10) + 1)

    for i in range(1, page):
        url1 = url + str(i)
        bsObj = crawer_travel_url_content(url1)
        bs = bsObj.find_all('a', attrs={'data-beacon': 'poi', 'target': '_blank'})
        for j in range(len(bs)):
            if bs[j].text != '':
                cat_name.append(bs[j].text)
                cat_url.append(bs[j].attrs['href'])
    print(cat_name, cat_url)
    print(len(cat_name))
    print(len(cat_url))
    return cat_name, cat_url

# Example: Haikou. Pass the base city URL (without '-jingdian'); the function appends that suffix itself
url = 'https://travel.qunar.com/p-cs300148-haikou'
attraction_name_list, attraction_url_list = crawer_travel_attraction_url(url)
attraction = pd.DataFrame({'attraction_name': attraction_name_list, 'attraction_url': attraction_url_list})
attraction.to_csv('travel_attraction.csv', encoding='utf_8_sig', index=False)
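
The snippet above handles one city at a time. Below is a minimal sketch, not part of the original post, of how step 1's output could drive step 2: it assumes crawer_travel_city_id / crawer_travel_attraction_url are defined in the same script and normalizes the saved hrefs with urljoin in case they are relative.

import pandas as pd
from urllib.parse import urljoin

# Read the city list produced by step 1 and crawl each city's top attractions
all_names, all_urls = [], []
cities = pd.read_csv('travel_city.csv')
for href in cities['city_code']:
    city_url = urljoin('https://travel.qunar.com/', str(href))  # tolerate relative hrefs
    names, urls = crawer_travel_attraction_url(city_url)
    all_names.extend(names)
    all_urls.extend(urls)

pd.DataFrame({'attraction_name': all_names, 'attraction_url': all_urls}).to_csv(
    'travel_attraction_all.csv', encoding='utf_8_sig', index=False)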

3. Crawl detailed attraction information (user reviews)

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import json
import requests
import time

def get_static_url_content(url):
    # Fetch a page and return it as a parsed BeautifulSoup object
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj

def get_jd_comment(url):
    # Total number of comments for this attraction (keep only the digits, to be safe)
    maxnum = get_static_url_content(url).find('span', {'class': 'e_nav_comet_num'}).text
    maxnum = int(''.join([x for x in maxnum if x.isdigit()]))

    # The POI id is the numeric part of the attraction URL
    poi = ''.join([x for x in url if x.isdigit()])

    cat_user_id = []
    cat_user_name = []
    cat_jd_poi = []
    cat_score = []
    cat_user_comment = []
    cat_comment_time = []

    url = 'http://travel.qunar.com/place/api/html/comments/poi/' + poi + '?poiList=true&sortField=1&rank=0&pageSize=50&page='
    # The page count is capped at 101 for now; with pageSize=50 that crawls at most 100*50 comments
    page = 101
    if (page - 1) * 50 > maxnum:
        page = int(((maxnum + 50) / 50) + 1)
    for i in range(1, page):
        url1 = url + str(i)
        json_str = requests.get(url1, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}).text
        try:
            json_data = json.loads(json_str)['data']
        except Exception:
            # Skip pages that do not return valid JSON with a 'data' field
            continue
        bsObj = BeautifulSoup(json_data, 'lxml')
        bs = bsObj.find_all('li', {'class': 'e_comment_item clrfix'})

        for j in range(len(bs)):
            try:
                # Reviewer id (digits of the profile href) and display name
                user = bs[j].find('div', {'class': 'e_comment_usr_name'}).find('a')
                cat_user_id.append(''.join([x for x in user.attrs['href'] if x.isdigit()]))
                cat_user_name.append(user.text)

                # POI id of the attraction this comment belongs to
                cat_jd_poi.append(poi)

                # Star rating: digits extracted from the inner total_star span's markup
                score = ''.join([x for x in str(bs[j].find('span', {'class': 'total_star'}).find('span')) if x.isdigit()])
                cat_score.append(score)

                # Comment text, which may span several <p> tags
                a = bs[j].find('div', {'class': 'e_comment_content'}).find_all('p')
                cat_user_comment.append(''.join(x.text for x in a))

                # Comment timestamp
                cat_comment_time.append(bs[j].find('div', {'class': 'e_comment_add_info'}).find('li').text)

            except Exception:
                print('problem at i=', i, 'j=', j)
        print('finished poi=', poi, ' ', i, '/', page - 1)
        time.sleep(3)

    return cat_user_id, cat_user_name, cat_jd_poi, cat_score, cat_comment_time, cat_user_comment

# Example: West Lake (Xihu)
url = 'http://travel.qunar.com/p-oi708952-xihu'
cat_user_id, cat_user_name, cat_jd_poi, cat_score, cat_comment_time, cat_user_comment = get_jd_comment(url)
comment_df = pd.DataFrame({'user_id': cat_user_id, 'user_name': cat_user_name, 'jd_poi': cat_jd_poi, 'score': cat_score, 'time': cat_comment_time, 'comment': cat_user_comment})
comment_df.to_csv('travel_comment.csv', encoding='utf_8_sig', index=False)
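
To turn the single-attraction example above into a full dataset, a sketch like the following (again an assumption, not part of the original code) loops get_jd_comment over every URL saved in step 2, assuming the attraction_url column holds absolute POI URLs.

import pandas as pd

# Crawl the comments of every attraction collected in step 2 and merge them
attractions = pd.read_csv('travel_attraction.csv')
frames = []
for poi_url in attractions['attraction_url']:
    uid, uname, jd_poi, score, ctime, comment = get_jd_comment(poi_url)
    frames.append(pd.DataFrame({'user_id': uid, 'user_name': uname, 'jd_poi': jd_poi,
                                'score': score, 'time': ctime, 'comment': comment}))

if frames:
    pd.concat(frames, ignore_index=True).to_csv('travel_comment_all.csv',
                                                encoding='utf_8_sig', index=False)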
