客车图片爬虫

最新推荐文章于 2024-08-21 15:31:17 发布
Mz枫
最新推荐文章于 2024-08-21 15:31:17 发布
阅读量217
点赞数
分类专栏：深度学习爬虫 Python
本文链接：https://blog.csdn.net/u012456019/article/details/105216175
版权
深度学习同时被 3 个专栏收录
17 篇文章 0 订阅
订阅专栏
爬虫
2 篇文章 0 订阅
订阅专栏
Python
2 篇文章 0 订阅
订阅专栏
# -*- coding: utf-8 -*-
import re
import requests
from pyquery import PyQuery as pq
# from getcookie import excuteScript
import time, random
import json
import os

requests.packages.urllib3.disable_warnings()

# str(content).encode('ISO-8859-1').decode('utf-8')
carbrandlist = ['宇通', '金龙', '黄海', '中通', '金旅', '少林', '海格', '安凯', '西沃', '江淮', '福田', '比亚迪', '东风', '申龙', ' 中大', '江铃', '女神',
                '大宇', '亚星', '恒通', '牡丹', '金华', '凌宇', '丰田', '丹东', '长安', '广通', '现代', '齐鲁', '京华']
image_num = 0
car_num = 0  # 当前是第几辆车
prepath = '/software/data/keche/'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}


# headers = {
#     'cache-control' :'private',
#     'content-length' :'93283',
#     'content-type' :'text/html; charset=utf-8',
#     'date' :'Mon, 30 Mar 2020 03:45:41 GMT',
#     'proxy-connection' :'Keep-Alive',
#     'server' :'Microsoft-IIS/7.5',
#     'via' :'proxy A',
#     'x-aspnet-version' :'4.0.30319',
#     'x-powered-by' :'ASP.NET'
# }


class KeCheCrawler():

    def __init__(self):

        # print(http)
        self.baseurl = 'http://www.cn2che.com/buycar/'
        self.sess = requests.Session()
        self.start_url = 'http://www.cn2che.com/buycar/c7b0c0s0p0c0m0p1c0r0m0i0o0o2'

    # def anti_value(self):
    #     '''
    #     获取antipas参数需要的key和value
    #     :return:
    #     '''
    #     content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
    #     params = re.findall(r"value=anti\('(.*?)','(.*?)'\)", content)[0]
    #     return params

    # def caculate_antipas(self):
    #     '''
    #     计算antipas参数
    #     :return:
    #     '''
    #     params = self.anti_value()
    #     antipas = excuteScript(params[0], params[1])
    #     self.sess.cookies.set('antipas', antipas)

    def page_url(self):
        # self.caculate_antipas()
        '''
        获取翻页链接
        :param start_url:
        :return:
        '''
        content = pq(self.sess.get(self.start_url, verify=False).text)
        # print("！！！content is ", content)
        totalpagestring = content(
            'div[@id="container"] div[@class="whiteBg"] div[@class="w"] span[@class="Total"]').text()
        pattern = r'共(.*?)页'
        totalpages = re.findall(pattern, totalpagestring)[0]
        print("total page is ", totalpages)
        # page_num_max = max([int(each.text()) for each in content(
        #     'div[@class="page-center search_list_one"] ul[@class="pagination"]  > li > a').items() if
        #                     re.match(r'\d+', each.text())])
        page_url_list = []
        for i in range(20, int(totalpages) + 1, 1):
            base_url = 'http://www.cn2che.com/buycar/c7b0c0s0p0c0m0p{}'.format(i) + 'c0r0m0i0o0o2'
            # print("第 %d 页", i)
            # print(base_url)
            page_url_list.append(base_url)

        return page_url_list

    def index_page(self, start_url):
        '''
        抓取详情页链接
        :param start_url:
        :return:
        '''
        # print(start_url)
        content = pq(self.sess.get(start_url).text)
        # print('$' * 200)
        # print(content)
        for each in content('p[@class="carBT"]  >a').items():
            # print("￥￥￥￥￥￥￥￥￥each is ",each)
            url = each.attr.href
            # print("￥￥￥￥￥￥￥￥￥url is ",url)
            if not url.startswith('http'):
                url = self.baseurl + url
            yield url

    def detail_page(self, detail_url):
        '''
        抓取详情信息
        :param detail_url:
        :return:
        '''

        content = pq(self.sess.get(detail_url).text, parser="html")
        # print(self.sess.get(detail_url).text)
        # print("^^^^^^^^^",content)
        img = str(content('img'))
        # print("content is ", img)
        pattern = r'img src=["](.*?)["] onerror'
        result = re.findall(pattern, img)
        detail=content('div[@class="leftmain"] div[@class="Detailed"] dl>dd>ol').text().strip().split('：')
        name=content('h1[@id="title"]').text()
        brandname=detail[4].split('\n')[0]
        for brand in carbrandlist:
            carbrand = brand
            if name.find(brand) != -1:
                break
                # content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
            carbrand = name
        data_dict = {
            'name': name,
            'carbrand': detail[4].split('\n')[0],
            'bordingdate': detail[6].split('\n')[0],
            'km': detail[7].split('\n')[0],
            'price': detail[1].split('\n')[0],
            'No': detail[2].split('\n')[0],
            'image': result
        }
        if not data_dict['name']:
            print(str(content).encode('ISO-8859-1').decode('utf-8'))

        return data_dict, result

    def request_download(self, https, carbrand):
        global car_num

        global image_num
        print("http is ",https)
        r = requests.get(https, verify=False)
        with open(
                prepath + carbrand + '/' + carbrand + str(car_num - 1) + '/' + carbrand + str(car_num - 1) + '_' + str(
                    image_num) + '.png', 'wb') as f:
            f.write(r.content)
            image_num = image_num + 1

    def run(self):
        global car_num
        for pageurl in self.page_url():
            # print(pageurl)
            for detail_url in self.index_page(pageurl):
                # print("datail is ", detail_url)
                listout, result = self.detail_page(detail_url)
                data_string = json.dumps(listout, ensure_ascii=False)
                carbrand = listout['carbrand']
                filename = carbrand + str(car_num)
                isExists = os.path.exists(prepath + carbrand + '/' + filename + '/')

                # 判断结果
                if not isExists:
                    # 如果不存在则创建目录
                    # 创建目录操作函数
                    os.makedirs(prepath + carbrand + '/' + filename + '/')
                file = open(prepath + carbrand + '/' + filename + '/' + filename + ".txt", "a+", encoding='utf-8')
                file.write(data_string)
                file.close()
                car_num = car_num + 1
                # print("list is ", listout)
                stop = 0
                for https in result:
                    if stop == 7:
                        break
                    self.request_download(https, carbrand)
                    stop = stop + 1
                print("暂停5-15秒,防止被关小黑屋")
                time.sleep(random.randint(5, 15))
            print('*' * 200)


if __name__ == '__main__':
    kccrawler = KeCheCrawler()
    kccrawler.run()