# Use the Scrapy framework to crawl hotel reviews (code actually targets hotel.elong.com, not Anjuke as the original note said)
import requests
from scrapy.contrib.spiders import CrawlSpider
import re
from bs4 import BeautifulSoup
import json
import urllib.request
from ..items import YilongItem
class YilongSpider(CrawlSpider):
    """Scrape hotel reviews from hotel.elong.com.

    NOTE(review): despite subclassing ``CrawlSpider``, all HTTP traffic is
    performed synchronously with ``requests`` inside ``parse`` — Scrapy only
    issues the initial request and collects the yielded items. Consider
    porting to ``scrapy.Request`` for real concurrency.
    """

    name = "yilong"
    # Fixed: was "yilong.org", but every URL this spider touches is on
    # elong.com. (Inert either way, since no scrapy.Request is yielded,
    # but the old value was plainly wrong.)
    allowed_domains = ["elong.com"]
    start_urls = ["http://hotel.elong.com/beijing/"]

    # AJAX endpoint returning one page of the hotel list (HTML fragment
    # embedded in JSON under value.hotelListHtml).
    LIST_URL = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'

    # AJAX endpoint returning one page of comments for one hotel.
    # NOTE(review): eToken / code / `_` timestamp are captured session
    # values and will eventually expire — refresh them if requests 403.
    COMMENT_URL = (
        'http://hotel.elong.com/ajax/comment/getcommentbypage/'
        '?hotelId={hotel_id}&recommendedType=0&pageIndex={page}'
        '&mainTagId=0&subTagId=0&rankType=0'
        '&eToken=e4473769-6b67-459a-9230-ee68a9cf44ee'
        '&code=9375392&_=1540819416448'
    )

    # Headers for the hotel-list POST. The hard-coded 'Content-Length: 1641'
    # from the original was removed: requests computes the correct length,
    # and a wrong fixed value can make the server reject or truncate the body.
    LIST_HEADERS = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Referer': 'http://hotel.elong.com/beijing/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'X-Requested-With': 'XMLHttpRequest',
    }

    # Headers for the comment GETs.
    # NOTE(review): the Cookie below is a captured session and will go
    # stale; it is kept byte-for-byte from the original capture.
    COMMENT_HEADERS = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'CookieGuid=e4473769-6b67-459a-9230-ee68a9cf44ee; s_eVar44=ppzqbaidu; _fid=e4473769-6b67-459a-9230-ee68a9cf44ee; ShHotel=CityID=0101&CityNameCN=%E5%8C%97%E4%BA%AC%E5%B8%82&CityName=%E5%8C%97%E4%BA%AC%E5%B8%82&OutDate=2018-11-06&CityNameEN=beijing&InDate=2018-11-05; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%E5%B8%82%23beijing%23; SHBrowseHotel=cn=30101020%2C%2C%2C%2C%2C%2C%3B90549076%2C%2C%2C%2C%2C%2C%3B90922044%2C%2C%2C%2C%2C%2C%3B50101599%2C%2C%2C%2C%2C%2C%3B91751292%2C%2C%2C%2C%2C%2C%3B&; newjava2=1aa848d2c49dfc6719f2c5e03cd1e8b3; JSESSIONID=6146394FA28F1210B2A4670108FC74CD; SessionGuid=2fb539a2-31fd-470b-9d64-c1ffcac1322b; Esid=98039b1b-ad09-47e5-a297-88b1c00f5eae; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=5&Isusefparam=0&Pkid=50793&Parentid=3150&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=9001; fv=pcweb; ext_param=bns%3D4%26ct%3D3; s_cc=true; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252F30101020%25252F%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; anti_token=C8C80446-4668-43E3-8EF5-FE55922A8718; __tccgd=0.0; __tctmc=0.248740382; __tctmd=0.207648715; semid=ppzqbaidu; outerFrom=ppzqbaidu; s_visit=1; __tctmb=0.87347167989023.1541417759666.1541417759666.1',
        'Host': 'hotel.elong.com',
        'Referer': 'http://hotel.elong.com/beijing/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'X-Requested-With': 'XMLHttpRequest',
    }

    @staticmethod
    def _remove_emoji(text, restr=''):
        """Replace astral-plane characters (emoji) in *text* with *restr*."""
        try:
            pattern = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # Narrow Python builds reject \U escapes above 0xFFFF and store
            # astral characters as surrogate pairs — match those instead.
            pattern = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return pattern.sub(restr, text)

    def _list_hotels(self, page_index):
        """POST the hotel-list AJAX endpoint for one page.

        Returns the ``<img>`` tags of the embedded HTML fragment; each tag
        carries the hotel id in ``data-mark`` ("img_<id>") and the hotel
        name in ``alt``.
        """
        payload = {
            'code': '-99',
            'listRequest.pageIndex': page_index,
            'listRequest.pageSize': '20',
        }
        resp = requests.post(self.LIST_URL, data=payload, headers=self.LIST_HEADERS)
        fragment = resp.json()['value']['hotelListHtml']
        return BeautifulSoup(fragment, 'html.parser').find_all('img')

    def _scrape_comments(self, hotel):
        """Yield a YilongItem per comment, paging until an empty page.

        Fixed: the original used ``.strip('img_')``, which strips the
        character set {i, m, g, _} from BOTH ends and would corrupt any id
        that starts or ends with those characters; an explicit prefix slice
        removes exactly the "img_" marker. Also unified on ``requests``
        (the original mixed in ``urllib.request`` and never closed the
        response — a resource leak).
        """
        hotel_id = hotel['data-mark']
        if hotel_id.startswith('img_'):
            hotel_id = hotel_id[len('img_'):]
        page = 1
        while True:
            url = self.COMMENT_URL.format(hotel_id=hotel_id, page=page)
            resp = requests.get(url, headers=self.COMMENT_HEADERS)
            comments = resp.json()['value']['Comments']
            if not comments:
                break
            for comment in comments:
                item = YilongItem()
                item['name'] = hotel['alt']
                item['user_name'] = self._remove_emoji(comment['CommentUser']['NickName'])
                item['content'] = self._remove_emoji(comment['Content'])
                item['creattime'] = comment['CreateTime']
                print(item['user_name'])  # kept from original: progress trace
                yield item
            page += 1

    def parse(self, response):
        """Walk list pages 1-9, then page through each hotel's comments.

        ``response`` (the Scrapy response for start_urls) is unused: every
        request below is made out-of-band with ``requests``.
        """
        for page_index in range(1, 10):
            for hotel in self._list_hotels(page_index):
                yield from self._scrape_comments(hotel)