This post crawls hotel reviews from eLong (艺龙) and writes them to a MySQL database and to txt files.
The crawl works much like the Kongfuzi used-book site covered in the previous post; this time we fetch every review for every hotel in the listing.
While writing to the database I hit a problem: emoji in the reviews could not be stored (a utf8 MySQL connection only handles characters up to three bytes, so four-byte emoji are rejected). If you run into the same issue, you can handle it the way I did:
# strip emoji so the comment can be stored in MySQL
def remove_emoji(comment, restr=''):
    # filter out emoji (characters outside the Basic Multilingual Plane)
    try:
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # narrow (UCS-2) Python builds reject the range above; match surrogate pairs instead
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, comment)
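A quick sanity check of the filter (the sample string is made up for illustration, and the snippet assumes re has been imported and remove_emoji is defined as above):

text = u'Nice hotel \U0001F600 close to the subway'
print remove_emoji(text)   # the emoji is stripped: 'Nice hotel  close to the subway'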
Below is the complete code:
#-*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import json
import MySQLdb
import re
import time
#fix the UnicodeEncodeError that comes up when writing (Python 2 only)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
print('Connecting to the MySQL server...')
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123mysql', db='onefive', charset='utf8')
print('Connected!')
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS comment3")
sql = """CREATE TABLE comment3(title CHAR(100),
         author CHAR(50),
         comment VARCHAR(600),
         `time` CHAR(20)) DEFAULT CHARSET=utf8mb4;"""
cur.execute(sql)
conn.commit()
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
headers = {'User-Agent': user_agent}
# strip emoji so the comment can be stored in MySQL
def remove_emoji(comment, restr=''):
    # filter out emoji (characters outside the Basic Multilingual Plane)
    try:
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # narrow (UCS-2) Python builds reject the range above; match surrogate pairs instead
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, comment)
def get_id():
    a2 = []
    # the Beijing hotel list is loaded through an AJAX POST; walk result pages 1-9
    for k in range(1, 10):
        url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
        headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
                   'Accept-Encoding': 'gzip, deflate',
                   'Accept-Language': 'zh-CN,zh;q=0.8',
                   'Connection': 'keep-alive',
                   'Content-Length': '1641',
                   'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                   'Host': 'hotel.elong.com',
                   'Origin': 'http://hotel.elong.com',
                   'Referer': 'http://hotel.elong.com/beijing/',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
                   'X-Requested-With': 'XMLHttpRequest'}
        data = {'code': '-99', 'listRequest.pageIndex': k}
        r = requests.post(url, data=data, headers=headers)
        a = json.loads(r.text)
        # the response carries a comma-separated string of hotel ids
        id = a['value']['hotelIds']
        id = id.split(',')
        a2.append(id)
    return a2
def get_title():
    w = get_id()
    for url_1 in w:
        for i in url_1:
            # hotel detail page, used only to grab the hotel name
            url_0 = 'http://hotel.elong.com/beijing/' + i + '/'
            print url_0
            html = requests.get(url_0, headers=headers).content
            soup = BeautifulSoup(html, 'html.parser')
            title_1 = soup.find('div', attrs={'class': 't24 yahei'})
            title = title_1.get_text().replace('\n', '')
            # write the hotel name as its own row; its comments follow as separate rows
            cur.execute("INSERT INTO comment3(title) VALUES (%s)", (title,))
            conn.commit()
            print title
            page = 1
            while page != 0:
                print 'Page %s' % page
                # comments are served page by page from the comment AJAX interface
                url_2 = 'http://hotel.elong.com/ajax/comment/getcommentbypage/?hotelId=' + i + '&recommendedType=0&pageIndex=' + str(page-1) + '&mainTagId=0&subTagId=0&rankType=0&eToken=4625b426-2601-4225-81fb-18baf1dea72a&code=9024905&_=1540299461025'
                html = requests.get(url_2, headers=headers).content
                b = json.loads(html)
                all_1 = b['value']
                all_2 = all_1['Comments']
                for j in all_2:
                    author = j['CommentUser']['NickName']
                    comment_1 = j['Content']
                    create_time = j['CreateTime']
                    # strip emoji so the row can be stored
                    comment = remove_emoji(comment_1, restr='')
                    into = "INSERT INTO comment3(author,comment,`time`) VALUES (%s, %s, %s)"
                    values = (author, comment, create_time)
                    cur.execute(into, values)
                    conn.commit()
                    print comment_1
                    # also append the comment to a txt file named after the hotel
                    end = (author + '\n' + comment + '\n' + create_time + '\n' + '\n').encode('utf-8')
                    f = open(title + '.txt', 'a')
                    f.write(end)
                    f.close()
                if all_2 == []:
                    print 'No more comments...'
                    break
                page += 1
get_title()
conn.close()
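Once the script finishes, a quick way to confirm the data actually landed in the database is to count the rows. This is only an illustrative check, assuming the same connection settings and table name used above:

#-*- coding:utf-8 -*-
# count the rows written into comment3 (same DB settings as the script above)
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123mysql', db='onefive', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM comment3")
print 'rows in comment3:', cur.fetchone()[0]
conn.close()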