Scrapy Google 爬虫实例（抓取 Google 搜索建议）

#!/usr/bin/python
# -*- coding:utf-8 -*-
import MySQLdb
import re
import sys
import json
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy import log
from scrapy import Request
from goog.items import GoogItem

class googSpider(Spider):
    """Spider that requests Google autocomplete suggestions for "<keyword> coupon".

    Keywords are read from the second column of the MySQL table
    ``birds_google_suggest``; one suggestion request is issued per row and
    its response is handled by ``parse``. ``parse_bottom`` stores tokens
    extracted from a search response, but no request built in this class
    currently uses it as a callback.
    """
    name = "goog"
    # Every request built below targets www.google.com, so that domain is
    # listed next to the original google.co.jp entry; otherwise a filter that
    # checks allowed_domains could discard the requests as offsite.
    allowed_domains = ["google.co.jp", "google.com"]
    start_urls = []

    # Positional arguments for MySQLdb.connect: host, user, password, database.
    _DB_ARGS = ('127.0.0.1', 'root', '123456', 'test')

    def _connect(self):
        """Open and return a new connection to the MySQL database."""
        return MySQLdb.connect(*self._DB_ARGS)

    def start_requests(self):
        """Yield one autocomplete request per row of ``birds_google_suggest``.

        All rows are fetched and the connection is closed before the first
        request is yielded, so the connection is released even if the
        generator is never fully consumed. On a MySQL error the error is
        printed and no requests are produced.
        """
        try:
            conn = self._connect()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute('select * from birds_google_suggest')
                    rows = cursor.fetchall()
                finally:
                    cursor.close()
            finally:
                conn.close()
        except MySQLdb.Error as e:
            print("Mysql Error %d %s" % (e.args[0], e.args[1]))
            return

        for row in rows:
            # Column 1 holds the keyword that is combined with "coupon".
            keyword = row[1]
            url = (r"https://www.google.com/complete/search"
                   r"?sclient=psy-ab&biw=1845&bih=407&q=" + keyword + r"%20coupon")
            yield Request(url, callback=self.parse,
                          meta={'keyword': keyword, 'level': '0'})

    def parse(self, response):
        """Build a comma-terminated token string from an autocomplete response.

        The body is decoded as JSON; for every entry of element 1 the first
        item has its ``<b>`` / ``</b>`` tags removed and a ``,`` appended.
        NOTE(review): presumably the payload looks like
        ``[query, [[suggestion_html, ...], ...], ...]`` -- confirm.
        """
        lst = json.loads(response.body)
        # Strip only the literal <b> and </b> tags. The previous character
        # class also removed every 'b', '<', '>', '/' and '|' character.
        brandToken = ''.join(re.sub(r'</?b>', '', v[0]) + ',' for v in lst[1])

        conn = self._connect()
        try:
            cursor = conn.cursor()
            try:
                # The insert is disabled in the original code, so the commit
                # below currently has nothing to persist. If it is re-enabled,
                # use a parameterized statement, e.g.:
                #   cursor.execute(
                #       "insert into `tb_brandToken` values('', %s, %s)",
                #       (lst[0], brandToken))
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()

    def parse_bottom(self, response):
        """Extract tokens from a search response and insert them into MySQL.

        The keyword comes from ``response.meta['keyword']``. The text between
        ``:[\\"`` and ``\\"],`` in the body is captured, every run of quote,
        pipe, backslash and comma characters is collapsed to a single ``,``,
        and a non-empty result is inserted into ``tb_brandBotToken``.
        Nothing is written when the body does not contain that pattern.
        """
        keyword = response.meta['keyword']
        match = re.search(r':\[\\"(.*?)\\"\],', response.body)
        if match is None:
            # No token list in this response; previously this raised
            # AttributeError on ``None.group``.
            return
        content = re.sub(r'''['|\\",]+''', ',', match.group(1))
        if not content:
            return

        conn = self._connect()
        try:
            cursor = conn.cursor()
            try:
                # Parameterized so scraped text cannot alter the statement.
                cursor.execute(
                    "insert into `tb_brandBotToken` values('', %s, %s)",
                    (keyword, content))
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值