Pyspider框架之大众点评数据抓取

需求

抓取全国所有城市的美食类店铺信息。

代码

注意:如果没有 IP 代理,请勿使用本代码。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-29 14:37:07
# Project: dianping_vi

from pyspider.libs.base_handler import *
import datetime
import re
import json
import copy

from pymongo import MongoClient

# Connect to the offline database.
# NOTE(review): DB_IP and DB_PORT have no values in this listing, so the module
# cannot be parsed until they are filled in -- presumably they were removed
# before publishing; confirm the real host and port.
DB_IP = 
DB_PORT = 

# Local-instance alternative (disabled):
#DB_IP = '127.0.0.1'
#DB_PORT = 27017

client = MongoClient(host=DB_IP, port=DB_PORT)

# The account lives in the admin database: connect, authenticate there,
# then switch to the working database.
db_auth = client.admin
# NOTE(review): called without arguments here -- the credentials look like
# they were stripped; supply them before running.
db_auth.authenticate( )

# Handles to the 'research' database and its 'dianping' collection.
DB_NAME = 'research'
DB_COL = 'dianping'
db = client[DB_NAME]
col = db[DB_COL]



# HTTP request headers addressed to www.dianping.com, carrying a desktop
# Chrome (Linux) User-Agent and a fixed session cookie. Long values are split
# with implicit string concatenation; the resulting strings are unchanged.
detail_headers = {
    'Host': 'www.dianping.com',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    ),
    'Cookie': (
        'hc.v=c433e5ea-ff94-9d82-2544-871b013c64eb.1536116212; '
        '_lxsdk_cuid=165a7a93ffcc8-0885d455e400d4-3b7b0d58-1aeaa0-165a7a93ffcc8; '
        '_lxsdk=165a7a93ffcc8-0885d455e400d4-3b7b0d58-1aeaa0-165a7a93ffcc8; '
        '_lxsdk_s=165a8c7b269-b24-ec-e63%7C%7C135'
    ),
    # Alternative cookie value, currently disabled:
    #'Cookie': '_lxsdk_cuid=165419b77c0c8-0b7bab6ed7c246-1e2e130c-1fa400-165419b77c1c8; _lxsdk=165419b77c0c8-0b7bab6ed7c246-1e2e130c-1fa400-165419b77c1c8; _hc.v=b53c090b-d406-9c02-4cf2-ef330bf04f87.1534404033; switchcityflashtoast=1; source=m_browser_test_33; pvhistory="6L+U5ZuePjo8L3N1Z2dlc3QvZ2V0SnNvbkRhdGE/Y2FsbGJhY2s9anNvbnBfMTUzNDQwNDI0NjYxOV82NTg3NT46PDE1MzQ0MDQyNDY2NzddX1s="; m_flash2=1; default_ab=citylist%3AA%3A1%7Cshop%3AA%3A1%7Cindex%3AA%3A1%7CshopList%3AA%3A1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=%7C%7C0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
}

def parse_score(taste, doc):
    """Decode an obfuscated one-decimal score into a number.

    Each entry of ``taste`` is looked up in the module-level ``num_map``
    (defined outside this block) to obtain a digit string; unknown entries
    decode to ``'0'``.

    Args:
        taste: Sequence of digit tokens. Presumably these are the markers of
            the obfuscated digit glyphs on the page -- confirm against the
            caller.
        doc: Parsed page exposing ``xpath()``. Only consulted when ``taste``
            has exactly one entry.

    Returns:
        ``float`` built as ``<digit>.<digit>`` for two tokens; for one token,
        ``1.<digit>`` or ``<digit>.1`` depending on the page text; ``0`` for
        any other token count.

    Raises:
        IndexError: If ``taste`` has one entry and the XPath query yields
            fewer than two text nodes.
    """
    if len(taste) == 2:
        integer_part, decimal_part = [num_map.get(x, '0') for x in taste]
        return float(integer_part + '.' + decimal_part)

    if len(taste) == 1:
        # Only one digit came through as a token; the other one is assumed to
        # be a literal "1" left in the page text (TODO confirm against a live
        # page), so the text decides which side of the decimal point it is on.
        digit = num_map.get(taste[0], '0')
        marker = doc.xpath('//span[@id="comment_score"]/span[1]/text()')[1]
        if marker.startswith('1'):
            return float('1.' + digit)
        # The literal "1" is the decimal digit. The previous code concatenated
        # digit + '1.', which parsed as e.g. 71.0 instead of 7.1.
        return float(digit + '.1')

    return 0



def get_today():
    """Return today's local date as a naive ``datetime`` at 00:00:00."""
    current_date = datetime.datetime.now().date()
    # Pair the date with the earliest time of day instead of round-tripping
    # through a formatted string; the resulting value is the same.
    return datetime.datetime.combine(current_date, datetime.time.min)


class Handler(BaseHandler):
    crawl_config = {
    
        'proxy': '',
        "headers": {
    
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        },
        'retries': 6
        
    }

    @every(minutes=24 * 60)
    def 
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值