马蜂窝采集
注意:无论某个 POI 的评价总数有多少,马蜂窝最多只能采集到 75 条评论(网页上同样只显示这么多),如下图所示:
import requests
import time
import re
import os
# Endpoint that serves one page of POI comments; the query string is supplied
# per request by get_page().
comment_poi_url = 'http://pagelet.mafengwo.cn/poi/pagelet/poiCommentListApi?'

# Numeric POI id, read interactively; non-numeric input raises ValueError.
poi = int(input("请输入你想要爬取的poi:"))

# Headers sent with every request: the Referer points at the POI's own page
# and the User-Agent is a fixed desktop Chrome string.
headers = {
    'Referer': 'http://www.mafengwo.cn/poi/%d.html' % (poi),
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}
# The patterns below are matched against HTML from which every space has
# already been removed (see _normalize_page), which is why tag names and
# attributes appear fused together.
_DATE_RE = re.compile(r'<aclass="btn-comment_j_comment"title="添加评论">评论</a><spanclass="time">(.*?)</span>')
_STAR_RE = re.compile(r'<spanclass="s-stars-star(\d)"></span>')
_COMMENT_RE = re.compile(r'<pclass="rev-txt">(.*?)</p>')


def _normalize_page(raw):
    """Turn the raw response text into the compact HTML the patterns expect."""
    # 'unicode-escape' expands backslash escapes such as \uXXXX in the payload.
    # The UTF-8 round trip with errors='ignore' then silently drops any code
    # point UTF-8 cannot encode (e.g. unpaired surrogates).
    text = raw.encode().decode('unicode-escape').encode('utf-8', 'ignore').decode('utf-8')
    # NOTE: removing every space also strips the spaces inside the captured
    # dates and comment bodies; the patterns above rely on this normalization.
    return (
        text.replace('\\/', '/')
        .replace("<br />", "")
        .replace(" ", "")
        .replace("\n", "")
        .replace("\r", "")
    )


def _parse_comments(page):
    """Return (date, star, comment) string tuples found in a normalized page."""
    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times yields fewer rows instead of raising
    # IndexError part-way through.
    return list(zip(
        _DATE_RE.findall(page),
        _STAR_RE.findall(page),
        _COMMENT_RE.findall(page),
    ))


def _write_pages(session, sink, max_page):
    """Fetch pages 1..max_page through *session* and write their rows to *sink*."""
    for page_no in range(1, max_page + 1):
        query = {
            'params': '{"poi_id":"%d","page":"%d","just_comment":1}' % (poi, page_no)
        }
        try:
            res = session.get(url=comment_poi_url, headers=headers, params=query, timeout=2)
            page = _normalize_page(res.text)
        except (requests.RequestException, UnicodeError) as exc:
            # Only network and decoding problems are reported as a fetch
            # failure; as before, the first failure ends the crawl.
            print('获取网页失败', exc)
            return
        for row in _parse_comments(page):
            sink.write('\t'.join(row) + '\n')


def get_page(out=None, session=None, max_page=4):
    """Download the POI's comment pages and write one TSV row per comment.

    Each row is ``date<TAB>star<TAB>comment`` followed by a newline.

    Args:
        out: Writable text file object. When omitted, the module-level
            ``name`` handle opened by the script entry point is used.
        session: Object with a requests-style ``get`` method. When omitted, a
            ``requests.Session`` is created and closed by this function; a
            caller-supplied session is left open.
        max_page: Last page number to request; pages 1..max_page are fetched.
            NOTE(review): the default of 4 preserves the original behaviour;
            the note at the top of the file mentions up to 75 comments being
            available, so a further page may exist - confirm against the site.

    Returns:
        None. On a request or decoding failure a message is printed and the
        remaining pages are skipped; rows already written are kept.
    """
    sink = name if out is None else out
    if session is not None:
        _write_pages(session, sink, max_page)
        return
    # The context manager releases the session's connections on every path.
    with requests.Session() as own_session:
        _write_pages(own_session, sink, max_page)
if __name__ == '__main__':
    # Script entry point: write a header row, then let get_page() append one
    # row per scraped comment to the same file.
    outfile = './Mafengwo_Comment.txt'
    # The handle is bound to the module-level name `name` because get_page()
    # writes through that global. The with-block closes the file even if
    # writing or scraping raises.
    with open(outfile, 'w', encoding='utf-8') as name:
        print('开始写入文件'.center(20, '-'))
        seq = '日期列表', '星级列表', '评论列表'
        name.write('\t'.join(seq) + '\n')
        get_page()
    print('写入完毕'.center(20, '-'))
    # NOTE(review): the purpose of this final pause is not evident from the
    # code; kept to preserve the original behaviour.
    time.sleep(1.2)