import pandas as pd
import urllib.request as req
import json
import sys
import time
import random
import re
# Diagnostic: log the interpreter's default string encoding (helps debug
# garbled non-ASCII console output on Windows).
print(sys.getdefaultencoding())
class MTCommentsCrawler:
    """Crawler for Meituan hotel review comments.

    Builds the review-list API URL for one hotel, fetches one page of
    comments disguised as a mobile browser, cleans each comment text
    (newlines, '#', emoji, blacklisted words, low-diversity spam,
    duplicates), and saves the result to local CSV/TXT files.
    """

    def __init__(self, productId=None, limit=100, start=0):
        self.productId = productId  # hotel ID to crawl
        self.limit = limit          # number of comments fetched per request
        self.start = start          # offset of the first comment
        self.locationLink = 'https://ihotel.meituan.com/api/v2/comments/biz/reviewList'
        self.paramValue = {
            'referid': self.productId,
            'limit': self.limit,
            'start': self.start,
        }
        self.locationUrl = None     # full request URL, built by concatLinkParam()
        self.tagid = 0              # tag counter written into the CSV rows
        self.re_s = []              # cleaned texts seen so far (duplicate detection)
        self.re_s_num = 0           # reserved counter, kept for compatibility

    def paramDict2Str(self, params):
        """Serialize *params* into a 'k=v&' query-string fragment.

        The trailing '&' is intentional: concatLinkParam() appends the
        fixed parameters immediately after it.
        """
        str1 = ''.join('%s=%s&' % (p, v) for p, v in params.items())
        print("str1:" + str1)
        return str1

    def concatLinkParam(self):
        """Build the full request URL and store it in self.locationUrl."""
        self.locationUrl = (
            self.locationLink + '?' + self.paramDict2Str(self.paramValue)
            + 'filterid=800&querytype=1&utm_medium=touch&version_name=999.9'
        )
        print("url:" + self.locationUrl)

    def requestMethodPage(self):
        """Return a urllib Request disguised as mobile Chrome (anti-blocking)."""
        headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36',
            'Referer': 'https://i.meituan.com/awp/h5/hotel-v2/feedback/index.html?poiId=%d' % (self.productId),
            'Host': 'ihotel.meituan.com',
        }
        url = self.locationUrl
        print('url : ', url)
        return req.Request(url, headers=headers)

    def showListPage(self):
        """Fetch the review-list endpoint and return the parsed JSON payload."""
        request_m = self.requestMethodPage()
        # Context manager closes the connection deterministically
        # (the original leaked it).
        with req.urlopen(request_m) as conn:
            return json.loads(conn.read().decode('utf-8'))

    def save_csv(self, df):
        """Save the tagged comments DataFrame as CSV (utf_8_sig = Excel-friendly BOM)."""
        # Raw string avoids accidental backslash escapes in the Windows path.
        df.to_csv(path_or_buf=r'F:\数据\c_%d.csv' % self.productId, sep=',',
                  header=True, index=True, encoding='utf_8_sig')
        print("保存成功!")

    def save_txt(self, df):
        """Save the bare comment texts as a UTF-8 text file."""
        df.to_csv(path_or_buf=r'F:\数据\c_%d.txt' % self.productId, sep=',',
                  header=False, index=False, encoding='utf-8')
        print("保存成功!")

    def remove_emoji(self, text):
        """Clean one comment text.

        Removes newlines and '#', rejects texts where more than half the
        characters are repeats, rejects texts containing a blacklisted
        word (read from the local filter file), and strips emoji.
        Returns '' when the comment should be discarded entirely.
        """
        text = text.replace('\n', '')
        text = text.replace('#', '')
        # Keep only texts whose character diversity exceeds 0.5.
        # BUGFIX: the original passed this rule to filter(rule, text),
        # which evaluates it on each single character (ratio always 1.0),
        # so nothing was ever filtered; apply it to the whole text instead.
        if text and len(set(text)) / len(text) > 0.5:
            time_str = text
        else:
            time_str = ''
        # Blacklist filtering against the local filter-word file.
        # BUGFIX: use a context manager so the file is closed even if the
        # regex loop raises (the original's explicit close leaked on error).
        with open(r"f:\数据\1.txt", "r", encoding='utf-8') as file_open:
            s = file_open.read().splitlines()
        ss = time_str
        ss_list = ss.splitlines()
        result = []
        for i in s:
            # Match the blacklisted word plus adjacent alphanumerics/commas
            # so spam fragments are removed whole.
            regular = "([0-9A-Za-z,]*{0}[0-9A-Za-z,]*)".format(i)
            result = result + re.findall(regular, ss)
        result_2 = " ".join(list(set(ss_list) - set(result)))
        # Non-BMP range covers emoji on wide builds; surrogate-pair
        # fallback for narrow builds.
        try:
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        if len(result_2) <= 5:      # too short to be a useful review
            return ''
        for i in s:
            if i in result_2:       # still contains a blacklisted word
                return ''
        return highpoints.sub(u'', result_2)

    def crawler(self):
        """Fetch one page of comments, deduplicate, and save to CSV and TXT."""
        self.tagid += 1
        print("diyici:", self.tagid)
        json_info = self.showListPage()
        tmp_list = []
        tmp_text_list = []
        comments = json_info['Data']['List']
        # First pass: remember every cleaned text so duplicates can be counted.
        cleaned = [self.remove_emoji(com['Content']) for com in comments]
        self.re_s.extend(cleaned)
        # Second pass: keep texts that are non-empty and not repeated.
        # (Reuses the first pass's results instead of re-cleaning —
        # the original re-read the filter file twice per comment.)
        for text in cleaned:
            if text == '':
                continue
            if self.re_s.count(text) >= 2:
                self.re_s.remove(text)  # drop one occurrence of the duplicate
                continue
            tmp_list.append([self.tagid, text])
            tmp_text_list.append([text])
        df = pd.DataFrame(tmp_list, columns=['Tag', 'content'])
        self.save_csv(df)
        df = pd.DataFrame(tmp_text_list, columns=['content'])
        self.save_txt(df)
# Script entry point: configure the crawl targets and run the crawler.
def mtComment():
    """Crawl comments for each configured hotel ID."""
    productIdGroup = [4191299]  # hotel IDs to crawl
    limit = 100                 # comments per request
    for productId in productIdGroup:
        # Randomize the start offset slightly to look less robotic.
        start = random.randint(1, 9)
        crawler = MTCommentsCrawler(productId, limit, start)
        crawler.concatLinkParam()
        crawler.crawler()
        # Pause 1-5 seconds between hotels to avoid being blocked.
        time.sleep(random.randint(1, 5))


if __name__ == '__main__':
    mtComment()
# 美团 - python
# (blog footer residue, commented out to keep the file importable:
#  "最新推荐文章于 2024-05-04 13:31:46 发布")