【Python 爬虫】携程、去哪儿评论爬虫

最新推荐文章于 2024-08-08 16:58:35 发布

置顶 东华果汁哥

最新推荐文章于 2024-08-08 16:58:35 发布

阅读量3.4k

点赞数 3

分类专栏：数据科学--python

本文链接：https://blog.csdn.net/u013421629/article/details/90042516

版权

数据科学--python 专栏收录该内容

289 篇文章 34 订阅

订阅专栏

1、去哪儿

# -*- coding:utf-8 -*-

import re
import json
import requests
import pandas as pd
# Scrape the visitor comments of Qunar sight 5759 page by page and save the
# (date, content) pairs to a CSV file.
# Uses the file-level imports: re, json, requests, pandas (as pd).
date = []
content = []
for i in range(1, 1000):
    print("正在抓取第"+str(i)+"页")
    url = ("https://touch.piao.qunar.com/touch/queryCommentsAndTravelTips.json"
           "?type=mp&pageSize=10&fromType=SIGHT&pageNum="+str(i)+"&sightId=5759&tagType=0")
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        html = requests.get(url, timeout=10).text
        commentList = json.loads(html)['data']['commentList']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Report the failed page and move on. Catching only the expected
        # errors means Ctrl-C still stops the script.
        print("第"+str(i)+"页抓取失败: "+repr(e))
        continue

    if not commentList:
        # An empty page means the comments are exhausted; 1000 is only an
        # upper bound, so stop instead of requesting the remaining pages.
        break

    for each in commentList:
        try:
            # Strip the literal "♬&#x20;" marker (a music-note glyph followed
            # by an HTML-encoded space) from the comment text.
            txt = re.sub("♬&#x20;", "", each['content'])
            date1 = each['date']
        except (KeyError, TypeError) as e:
            # Skip only the malformed comment, not the rest of the page.
            print("第"+str(i)+"页有一条评论解析失败: "+repr(e))
            continue
        # Append both values together so the two columns stay aligned.
        # The cleaned text is stored; previously the raw text was saved and
        # the cleaned value was discarded.
        date.append(date1)
        content.append(txt)


# NOTE: the 'date:' header (with trailing colon) is kept exactly as before so
# the CSV layout does not change for anything that reads this file.
result = pd.DataFrame({'date:': date, 'content': content})

result.to_csv('F:/qunaer.csv', index=False)

2、携程

# -*- coding:utf-8 -*-
import re
import requests
import json

import pandas as pd

# Scrape the comments of Ctrip business 20485 page by page and save the
# (year-month, content) pairs to a CSV file.
# Uses the imports above this block: re, requests, json.
date = []
comment = []

# The endpoint and the request "head" are identical for every page, so they
# are built once outside the loop.
url = "https://m.ctrip.com/restapi/soa2/10491/json/GetCommentListAndHotTagList?_fxpcqlniredt=09031014411533277785"

head = {
    "auth": "null",
    "cid": "09031014411533277785",
    "ctok": "",
    "cver": "1.0",
    "lang": "01",
    "sid": "8888",
    "syscode": "09"
}

for i in range(1, 130):
    print('正在抓取第'+str(i)+"页")

    # Key order is kept as in the original so the serialized request body is
    # unchanged; only PageIndex varies between requests.
    data = {
        "CommentResultInfoEntity": {
            "BusinessId": "20485",
            "BusinessType": "11",
            "ChannelType": "7",
            "CommentTagId": "0",
            "ImageFilter": "false",
            "PageIndex": i,
            "PageSize": "10",
            "PoiId": "0",
            "SortType": "3",
            "StarType": "0",
            "TouristType": "0",
            "VideoImageHeight": "392",
            "VideoImageWidth": "700"
        },
        "head": head
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        html = requests.post(url, data=json.dumps(data), timeout=10).text
        CommentInfo = json.loads(html)['CommentResult']['CommentInfo']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Report the failed page and move on. Catching only the expected
        # errors means Ctrl-C still stops the script.
        print('第'+str(i)+"页抓取失败: "+repr(e))
        continue

    if not CommentInfo:
        # An empty (or null) page means there are no more comments; stop
        # instead of requesting the remaining pages.
        break

    for each in CommentInfo:
        try:
            # Strip the literal "♬&#x20;" marker (a music-note glyph followed
            # by an HTML-encoded space) from the comment text.
            Content = re.sub("♬&#x20;", "", each['Content'])
            # The date is recorded at year-month granularity only.
            date1 = str(each['PlayYear'])+'-'+str(each['PlayMonth'])
        except (KeyError, TypeError) as e:
            # Skip only the malformed comment, not the rest of the page.
            print('第'+str(i)+"页有一条评论解析失败: "+repr(e))
            continue
        print(Content)
        # Append both values together so the two columns stay aligned.
        comment.append(Content)
        date.append(date1)


# NOTE: the 'date:' header (with trailing colon) is kept exactly as before so
# the CSV layout does not change for anything that reads this file.
result = pd.DataFrame({'date:': date, 'content': comment})

result.to_csv('F:/xiecheng.csv', index=False)