Scraping Howbuy (2018-05-16)

Copyright notice: https://blog.csdn.net/u011391734/article/details/80268753
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
import pymysql


class HowbuySpider(scrapy.Spider):
    name = 'howbuy'
    # Start URLs: only usable for requests that need no login, since
    # cookies and similar session state cannot be attached here
    start_urls = ['https://simu.howbuy.com/company/']

    # Browser User-Agent header, sent with every request
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'}

    # Holds the most recently built form payload; parse() reuses it when
    # requesting company detail pages
    json_data = {}
    cookies = {
        '__hutma': '268394641.1477849753.1525747537.1525747537.1525752023.2',
        '__hutmb': '268394641.6.10.1525752023',
        '__hutmc': '268394641',
        '__hutmmobile': '5107ED26-8437-4862-AA6E-BC8903F8D62B',
        '__hutmz': '268394641.1525747537.1.1.hutmcsr=(direct)|hutmccn=(direct)|hutmcmd=(none)',
        '_ga': 'GA1.2.1895642071.1525747598',
        '_gid': 'GA1.2.2088878427.1525747598',
        '_hb_pgid': '9200539722',
        'FUNDID_COOKIE': 'COOKIE20180508000000241112',
        'Hm_lpvt_394e04be1e3004f9ae789345b827e8e2': '1525752023',
        'Hm_lpvt_f737b389ea57a0a21e1ff802f849fbf0': '1525752215',
        'Hm_lvt_394e04be1e3004f9ae789345b827e8e2': '1525747538',
        'Hm_lvt_f737b389ea57a0a21e1ff802f849fbf0': '1525746115,1525747545',
        'OZ_1K_1497': 'etime=1525752022&ozu_sid=-&ozs=631830&flag=2&compid=1497',
        'OZ_1S_1497': 'etime=1525752022&ozu_sid=-&ozs=631830&flag=2&compid=1497',
        'OZ_1U_1497': 'vid=vaf10f51d0ac57.0&ctime=1525752214&ltime=1525752154',
        'OZ_1Y_1497': 'erefer=-&eurl=https%3A//www.howbuy.com/%3Fozs%3D631830-1497&etime=1525752022&ctime=1525752214&ltime=1525752154&compid=1497',
        'OZ_SI_1497': 'sTime=1525747537&sIndex=130',
        'SESSION': '519982ff-c227-49fb-b2db-1d001806e5c4',
        'simu_qualified_v2': '4',
        'USER_INFO_COOKIE': '8009617213',
        'USER_SALT_COOKIE': '6cb573c2f817f868eed57dfbf84ca7e6',
    }
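
    # The cookie values above look copied from the browser's developer tools
    # after logging in. If you capture them as one raw "Cookie:" header string
    # instead, a small helper like this (my own sketch, not from the original
    # post) converts it into the dict format Scrapy expects:
    @staticmethod
    def cookies_from_header(raw_cookie):
        return dict(pair.split('=', 1) for pair in raw_cookie.split('; '))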

    def start_requests(self):
        db = pymysql.connect(
            host="192.168.0.21",
            user="public",
            passwd="123456",
            db="howbuy",
            port=3321,
            charset="utf8")
        cur = db.cursor()
        # Empty all three target tables before a fresh crawl
        cur.execute("TRUNCATE TABLE company_infor")
        cur.execute("TRUNCATE TABLE product_list")
        cur.execute("TRUNCATE TABLE history_value")
        db.commit()
        db.close()

        for url in self.start_urls:
            for x_page in range(1, 751):
                # POST body for one page of the company list (750 pages, 20 rows each)
                json_datas = {
                    'allPage': '750',
                    'orderType': 'Desc',
                    'page': str(x_page),
                    'perPage': '20',
                    'sortField': 'hb1nscclzyjj',
                }
                self.json_data = json_datas
                yield scrapy.FormRequest(url, headers=self.header, cookies=self.cookies,
                                         callback=self.parse, formdata=self.json_data)
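
    # The same pymysql connection block is repeated verbatim in every callback
    # below. A helper method like this (connect_db is my own name, not from the
    # original post) would remove the duplication; each callback could then call
    # db = self.connect_db() instead of spelling out the parameters again.
    def connect_db(self):
        return pymysql.connect(host="192.168.0.21", user="public", passwd="123456",
                               db="howbuy", port=3321, charset="utf8")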

    def parse(self, response):

        db = pymysql.connect(
            host="192.168.0.21",
            user="public",
            passwd="123456",
            db="howbuy",
            port=3321,
            charset="utf8")
        cur = db.cursor()

        papers = response.xpath(
            r'//div[@class="fund_list"]/table/tbody/tr')

        for paper in papers:
            company_name = paper.xpath(
                r'td[2]/a[@target="_blank"]/text()').extract()[0]
            company_address = paper.xpath(r'td[3]/text()').extract()[0]
            create_time = paper.xpath(r'td[4]/text()').extract()[0]
            fund_amount = paper.xpath(r'td[6]//td[1]/a/text()').extract()[0]
            stand_fund = paper.xpath(r'td[6]//td[2]/a/text()').extract()[0]
            rate_of_return = paper.xpath(
                r'td[6]//td[3]/span/text()').extract()[0]
            company_link = paper.xpath(
                'td[2]/a[@target="_blank"]/@href').extract()[0]

            # Parameterized insert avoids quoting problems in scraped strings
            sql = 'insert into company_infor values(%s,%s,%s,%s,%s,%s,%s)'
            cur.execute(sql, (company_name, company_address, create_time,
                              fund_amount, stand_fund, rate_of_return, company_link))
            db.commit()

            yield scrapy.FormRequest(url=response.urljoin(company_link), headers=self.header,
                                     cookies=self.cookies, formdata=self.json_data,
                                     callback=self.product_list)

        db.close()

    def product_list(self, response):
        db = pymysql.connect(
            host="192.168.0.21",
            user="public",
            passwd="123456",
            db="howbuy",
            port=3321,
            charset="utf8")
        cur = db.cursor()
        papers = response.xpath(r"//div[@class= 'fund_intro']/div//ul")
        company_name = response.xpath(
            r"//div[@class='con_left fl']/h2/text()").extract()[0]
        for paper in papers:
            product_number = paper.xpath(r'li[1]/text()').extract()[0]
            product_name = paper.xpath(r'li[2]/a/text()').extract()[0]
            product_type = paper.xpath(r'li[3]/text()').extract()[0]
            rate_of_return_this_year = paper.xpath(
                r'li[7]//text()').extract()[0]
            create_rate = paper.xpath(r'li[8]//text()').extract()[0]
            net_asset_value = paper.xpath(r'li[5]/text()').extract()[0]
            create_time = paper.xpath(r'li[6]/text()').extract()[0]
            history_link = paper.xpath(r'li[2]/a/@href').extract()[0] + "lsjz"

            # Parameterized insert, as in parse() above
            sql = 'insert into product_list values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cur.execute(sql, (company_name, product_number, product_name, product_type,
                              rate_of_return_this_year, create_rate, net_asset_value,
                              create_time, history_link))
            db.commit()
            yield scrapy.FormRequest(url=response.urljoin(history_link), headers=self.header,
                                     cookies=self.cookies, callback=self.history_values)
        db.close()

    def history_values(self, response):

        db = pymysql.connect(
            host="192.168.0.21",
            user="public",
            passwd="123456",
            db="howbuy",
            port=3321,
            charset="utf8")
        cur = db.cursor()
        papers = response.xpath(
            r"//div[@class='fund_data']//tr[position()>1]")
        company_name = response.xpath(
            r"//span[@class='cBlue']/a/text()").extract()[0]
        # The product title is page-level, so extract it once rather than per row
        product_name = response.xpath(
            r'//div[@class="trade_fund_title clearfix"]/h1/text()').extract()[0]
        for paper in papers:
            date = paper.xpath(r'td[1]/text()').extract()[0]
            net_values = paper.xpath(r'td[2]/text()').extract()[0]
            all_values = paper.xpath(r'td[3]/text()').extract()[0]
            rise_and_fall = paper.xpath(r'td[4]//text()').extract()[0]

            sql = 'insert into history_value values(%s,%s,%s,%s,%s,%s)'
            cur.execute(sql, (company_name, product_name, date,
                              net_values, all_values, rise_and_fall))
            db.commit()

        db.close()
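
The post never shows the MySQL schema it writes into. Judging from the insert statements, company_infor takes 7 columns, product_list 9, and history_value 6. A one-off setup script along these lines would create matching tables; the column names and VARCHAR types are my own guesses based on the spider's variable names, not the author's actual DDL:

# -*- coding: utf-8 -*-
# Hypothetical table setup inferred from the spider's INSERT statements;
# column names and types are assumptions, not the author's schema.
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS company_infor (
        company_name VARCHAR(255), company_address VARCHAR(255),
        create_time VARCHAR(64), fund_amount VARCHAR(64),
        stand_fund VARCHAR(64), rate_of_return VARCHAR(64),
        company_link VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS product_list (
        company_name VARCHAR(255), product_number VARCHAR(64),
        product_name VARCHAR(255), product_type VARCHAR(64),
        rate_of_return_this_year VARCHAR(64), create_rate VARCHAR(64),
        net_asset_value VARCHAR(64), create_time VARCHAR(64),
        history_link VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS history_value (
        company_name VARCHAR(255), product_name VARCHAR(255),
        date VARCHAR(32), net_values VARCHAR(32),
        all_values VARCHAR(32), rise_and_fall VARCHAR(32))""",
]

db = pymysql.connect(host="192.168.0.21", user="public", passwd="123456",
                     db="howbuy", port=3321, charset="utf8")
with db.cursor() as cur:
    for stmt in DDL:
        cur.execute(stmt)
db.commit()
db.close()

With the tables in place, the spider runs from the Scrapy project directory with the usual scrapy crawl howbuy.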