需求
网站入口:www.tripadvisor.com
在首页底部,遍历并依次点开所有城市链接:
点击某个城市链接后,进入该城市的全部酒店(hotel)列表页
代码
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-06 11:16:59
# Project: trip_hotel
from pyspider.libs.base_handler import *
import datetime
import re
import json
import copy
from pymongo import MongoClient
# Connect to the offline (non-production) MongoDB instance.
# NOTE: the host, port and credentials were left blank in the original
# listing; fill them in before running. A local instance would use
# host '127.0.0.1' and port 27017.
DB_IP = ''
# The original assignment had no right-hand side (a SyntaxError). 27017 is
# MongoDB's default port and matches the local setting noted above.
DB_PORT = 27017
DB_USER = ''
DB_PASSWORD = ''
DB_NAME = 'research'

# The account is defined in the admin database, so authenticate against it
# (authSource) and then switch to the working database. Credentials are
# passed to MongoClient because Database.authenticate() was removed in
# PyMongo 4.0; note that authentication now happens lazily on first use
# rather than at import time.
client = MongoClient(
    host=DB_IP,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PASSWORD,
    authSource='admin',
)
# Handle to the admin database, kept so existing references keep working.
db_auth = client.admin
db = client[DB_NAME]
def get_today():
    """Return today's local date as a naive ``datetime`` at midnight.

    Returns:
        datetime.datetime: The current local date with hour, minute, second
        and microsecond all set to zero and no ``tzinfo`` attached.
    """
    # Build the value directly instead of formatting the current time as
    # '%Y-%m-%d' and parsing it back; the resulting timestamp is the same.
    return datetime.datetime.combine(datetime.date.today(), datetime.time.min)
class Handler(BaseHandler):
    """pyspider handler for the trip_hotel project.

    Crawling starts from the TripAdvisor home page (``url``); the response is
    handed to ``index_page``.
    """

    # Settings for the requests this handler issues: a desktop Chrome user
    # agent, a cookie setting the currency to USD (presumably so prices are
    # returned in USD -- confirm), an HTTP proxy on a private 10.x address,
    # and a retry count of 5.
    crawl_config = {
        'headers': {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
            'cookie': 'SetCurrency=USD',
        },
        'proxy': 'http://10.15.100.94:6666',
        'retries': 5,
    }

    # Entry point of the crawl.
    url = 'https://www.tripadvisor.com/'

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the entry URL and route its response to ``index_page``.

        The ``@every`` decorator is given an interval of 24 * 60 minutes,
        i.e. one day.
        """
        self.crawl(self.url, callback=self.index_page)
@config(age=60)
def index_page(self, response):
page = response.etree
city_list = page.xpath("//div[@class='customSelection']/div[@class='boxhp collapsibleLists']/div[@class='section']/div[@class='ui_columns' or @class='ui_columns no-collapse']/ul[@class='lst ui_column is-4']/li[@class='item']")
print(len(city_list))
base_url = 'https://www.tripadvisor.com'
for each in city_list:
city_name = each.xpath("./a/text()")[0]
city_link = base_url + each.xpa