Bus Image Crawler
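The script below crawls the used-bus listings on cn2che.com: it builds the paginated index URLs, follows each listing to its detail page, writes the vehicle's metadata out as JSON, and downloads up to seven photos per vehicle into a per-car directory.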

# -*- coding: utf-8 -*-
import re
import requests
from pyquery import PyQuery as pq
# from getcookie import excuteScript
import time, random
import json
import os

# Suppress the InsecureRequestWarning triggered by the verify=False requests below.
requests.packages.urllib3.disable_warnings()

# str(content).encode('ISO-8859-1').decode('utf-8')
# Known bus brands, used to match a brand name inside the listing title.
carbrandlist = ['宇通', '金龙', '黄海', '中通', '金旅', '少林', '海格', '安凯', '西沃', '江淮', '福田', '比亚迪', '东风', '申龙', '中大', '江铃', '女神',
                '大宇', '亚星', '恒通', '牡丹', '金华', '凌宇', '丰田', '丹东', '长安', '广通', '现代', '齐鲁', '京华']
image_num = 0  # global counter of downloaded images
car_num = 0  # index of the current car
prepath = '/software/data/keche/'  # root directory for downloaded data
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is omitted: requests cannot decode Brotli responses unless the brotli package is installed.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
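# These headers are attached to the crawler's Session in KeCheCrawler.__init__ below.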


# headers = {
#     'cache-control' :'private',
#     'content-length' :'93283',
#     'content-type' :'text/html; charset=utf-8',
#     'date' :'Mon, 30 Mar 2020 03:45:41 GMT',
#     'proxy-connection' :'Keep-Alive',
#     'server' :'Microsoft-IIS/7.5',
#     'via' :'proxy A',
#     'x-aspnet-version' :'4.0.30319',
#     'x-powered-by' :'ASP.NET'
# }
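# (The commented block above is a captured server *response* header dump, kept
# for reference only; fields like server/x-aspnet-version are not request headers.)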


class KeCheCrawler:

    def __init__(self):
        self.baseurl = 'http://www.cn2che.com/buycar/'
        self.sess = requests.Session()
        # Send the browser-like headers on every request made through this session.
        self.sess.headers.update(headers)
        self.start_url = 'http://www.cn2che.com/buycar/c7b0c0s0p0c0m0p1c0r0m0i0o0o2'

    # def anti_value(self):
    #     '''
    #     Get the key and value needed for the antipas anti-crawler parameter
    #     :return:
    #     '''
    #     content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
    #     params = re.findall(r"value=anti\('(.*?)','(.*?)'\)", content)[0]
    #     return params

    # def caculate_antipas(self):
    #     '''
    #     Compute the antipas parameter
    #     :return:
    #     '''
    #     params = self.anti_value()
    #     antipas = excuteScript(params[0], params[1])
    #     self.sess.cookies.set('antipas', antipas)

    def page_url(self):
        # self.caculate_antipas()
        '''
        Build the list of pagination URLs
        :return: list of listing page URLs
        '''
        content = pq(self.sess.get(self.start_url, verify=False).text)
        # The total page count is printed as "共N页" inside the span.Total element.
        totalpagestring = content(
            'div[@id="container"] div[@class="whiteBg"] div[@class="w"] span[@class="Total"]').text()
        pattern = r'共(.*?)页'
        totalpages = re.findall(pattern, totalpagestring)[0]
        print("total page is ", totalpages)
        page_url_list = []
        # NOTE: the range starts at page 20, apparently a resume point from an
        # earlier run; change the start to 1 to crawl from the first page.
        for i in range(20, int(totalpages) + 1):
            base_url = 'http://www.cn2che.com/buycar/c7b0c0s0p0c0m0p{}c0r0m0i0o0o2'.format(i)
            page_url_list.append(base_url)

        return page_url_list

    def index_page(self, start_url):
        '''
        Extract detail page links from one listing page
        :param start_url: URL of a listing page
        :return: generator of detail page URLs
        '''
        content = pq(self.sess.get(start_url, verify=False).text)
        for each in content('p[@class="carBT"] > a').items():
            url = each.attr.href
            # Relative links need the site prefix.
            if not url.startswith('http'):
                url = self.baseurl + url
            yield url

    def detail_page(self, detail_url):
        '''
        Scrape the detail information of one car
        :param detail_url: URL of the detail page
        :return: (data_dict, list of image URLs)
        '''
        content = pq(self.sess.get(detail_url, verify=False).text, parser="html")
        # Image URLs sit in <img src="..." onerror=...> tags.
        img = str(content('img'))
        pattern = r'img src=["](.*?)["] onerror'
        result = re.findall(pattern, img)
        # The spec list reads as "label:value" pairs; splitting on ':' leaves each
        # value (with the next label glued on after a newline) at a fixed index.
        detail = content('div[@class="leftmain"] div[@class="Detailed"] dl>dd>ol').text().strip().split(':')
        name = content('h1[@id="title"]').text()
        brandname = detail[4].split('\n')[0]
        # Match a known brand inside the title; fall back to the full title.
        # (carbrand is currently unused; the spec-list brand is what gets stored.)
        for brand in carbrandlist:
            if name.find(brand) != -1:
                carbrand = brand
                break
        else:
            carbrand = name
        data_dict = {
            'name': name,
            'carbrand': brandname,
            'bordingdate': detail[6].split('\n')[0],
            'km': detail[7].split('\n')[0],
            'price': detail[1].split('\n')[0],
            'No': detail[2].split('\n')[0],
            'image': result
        }
        if not data_dict['name']:
            # An empty title usually means a mis-decoded (anti-crawler) page; dump it for debugging.
            print(str(content).encode('ISO-8859-1').decode('utf-8'))

        return data_dict, result

    def request_download(self, https, carbrand):
        '''
        Download one image; https is the image URL.
        '''
        global car_num
        global image_num
        print("image url is ", https)
        r = requests.get(https, verify=False)
        # run() increments car_num before downloading, hence car_num - 1 here;
        # image_num is a global counter that runs across all cars.
        filepath = (prepath + carbrand + '/' + carbrand + str(car_num - 1) + '/'
                    + carbrand + str(car_num - 1) + '_' + str(image_num) + '.png')
        with open(filepath, 'wb') as f:
            f.write(r.content)
        image_num = image_num + 1

    def run(self):
        global car_num
        for pageurl in self.page_url():
            for detail_url in self.index_page(pageurl):
                listout, result = self.detail_page(detail_url)
                data_string = json.dumps(listout, ensure_ascii=False)
                carbrand = listout['carbrand']
                filename = carbrand + str(car_num)
                dirpath = prepath + carbrand + '/' + filename + '/'

                # Create the per-car directory if it does not exist yet.
                if not os.path.exists(dirpath):
                    os.makedirs(dirpath)
                with open(dirpath + filename + ".txt", "a+", encoding='utf-8') as f:
                    f.write(data_string)
                car_num = car_num + 1
                # Download at most 7 images per car.
                stop = 0
                for https in result:
                    if stop == 7:
                        break
                    self.request_download(https, carbrand)
                    stop = stop + 1
                print("Pausing 5-15 seconds to avoid getting banned")
                time.sleep(random.randint(5, 15))
            print('*' * 200)


if __name__ == '__main__':
    kccrawler = KeCheCrawler()
    kccrawler.run()
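
The script calls requests.get for every image with no timeout and no retry, so a single dropped connection aborts the whole crawl. Below is a minimal hardening sketch, assuming the same session and verify=False setup as above; download_image, max_retries, and timeout are illustrative names, not part of the original script.

import time

import requests


def download_image(sess, url, path, max_retries=3, timeout=10):
    '''Fetch url and write the body to path, retrying on transient errors.'''
    for attempt in range(1, max_retries + 1):
        try:
            r = sess.get(url, timeout=timeout, verify=False)
            r.raise_for_status()  # treat HTTP 4xx/5xx responses as failures too
            with open(path, 'wb') as f:
                f.write(r.content)
            return True
        except requests.RequestException as e:
            print("attempt %d failed for %s: %s" % (attempt, url, e))
            time.sleep(2 * attempt)  # simple linear backoff between retries
    return False

request_download could delegate to a helper like this and skip cars whose images repeatedly fail instead of crashing mid-crawl.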