# -*- coding: utf-8 -*-
import re
import requests
from pyquery import PyQuery as pq
# from getcookie import excuteScript
import time, random
import json
import os
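# Silence the InsecureRequestWarning raised by the verify=False requests below.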
requests.packages.urllib3.disable_warnings()
# Encoding workaround used below: str(content).encode('ISO-8859-1').decode('utf-8')
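# Known bus manufacturers; detail_page() matches one of these against the listing title.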
carbrandlist = ['宇通', '金龙', '黄海', '中通', '金旅', '少林', '海格', '安凯', '西沃', '江淮', '福田', '比亚迪', '东风', '申龙', '中大', '江铃', '女神',
                '大宇', '亚星', '恒通', '牡丹', '金华', '凌宇', '丰田', '丹东', '长安', '广通', '现代', '齐鲁', '京华']
image_num = 0  # running counter used in saved image filenames
car_num = 0  # index of the car currently being processed
prepath = '/software/data/keche/'  # output root: one folder per car under each brand
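# Browser-like request headers, attached to the crawler session in __init__.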
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
class KeCheCrawler:
def __init__(self):
# print(http)
self.baseurl = 'http://www.cn2che.com/buycar/'
self.sess = requests.Session()
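        # Attach the browser-like headers so every request made through the session sends them.
        self.sess.headers.update(headers)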
self.start_url = 'http://www.cn2che.com/buycar/c7b0c0s0p0c0m0p1c0r0m0i0o0o2'
    # def anti_value(self):
    #     '''
    #     Get the key and value needed to compute the antipas parameter.
    #     :return:
    #     '''
    #     content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
    #     params = re.findall(r"value=anti\('(.*?)','(.*?)'\)", content)[0]
    #     return params
    # def caculate_antipas(self):
    #     '''
    #     Compute the antipas anti-crawler cookie and set it on the session.
    #     :return:
    #     '''
    #     params = self.anti_value()
    #     antipas = excuteScript(params[0], params[1])
    #     self.sess.cookies.set('antipas', antipas)
def page_url(self):
# self.caculate_antipas()
        '''
        Collect the pagination URLs for the listing pages.
        :return: list of page URLs
        '''
content = pq(self.sess.get(self.start_url, verify=False).text)
# print("!!!content is ", content)
totalpagestring = content(
'div[@id="container"] div[@class="whiteBg"] div[@class="w"] span[@class="Total"]').text()
        pattern = r'共(.*?)页'  # the Total span reads like "共123页" ("123 pages in total")
        totalpages = re.findall(pattern, totalpagestring)[0]
        print("total page is ", totalpages)
# page_num_max = max([int(each.text()) for each in content(
# 'div[@class="page-center search_list_one"] ul[@class="pagination"] > li > a').items() if
# re.match(r'\d+', each.text())])
        page_url_list = []
        # Starts at page 20, apparently resuming an earlier run; use 1 for a full crawl.
        for i in range(20, int(totalpages) + 1):
            # p{} before the final segment is the page-number slot in the listing route.
            base_url = 'http://www.cn2che.com/buycar/c7b0c0s0p0c0m0p{}c0r0m0i0o0o2'.format(i)
            page_url_list.append(base_url)
        return page_url_list
def index_page(self, start_url):
        '''
        Scrape the detail-page links from one listing page.
        :param start_url: listing-page URL
        :return: yields detail-page URLs
        '''
# print(start_url)
content = pq(self.sess.get(start_url).text)
# print('$' * 200)
# print(content)
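        # Each car's title link lives in <p class="carBT"><a href="...">.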
for each in content('p[@class="carBT"] >a').items():
# print("¥¥¥¥¥¥¥¥¥each is ",each)
url = each.attr.href
# print("¥¥¥¥¥¥¥¥¥url is ",url)
if not url.startswith('http'):
url = self.baseurl + url
yield url
def detail_page(self, detail_url):
        '''
        Scrape the detail info for one car.
        :param detail_url: detail-page URL
        :return: (data_dict, list of image URLs)
        '''
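        # parser="html" selects lxml's forgiving HTML parser for the site's messy markup.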
content = pq(self.sess.get(detail_url).text, parser="html")
# print(self.sess.get(detail_url).text)
# print("^^^^^^^^^",content)
img = str(content('img'))
# print("content is ", img)
        # The car photos carry an onerror fallback attribute; grab their src URLs.
        pattern = r'img src=["](.*?)["] onerror'
        result = re.findall(pattern, img)
        # The spec block is "label:value" text; after splitting on ':', each element holds
        # a value followed by the next label, so split('\n')[0] extracts just the value.
        detail = content('div[@class="leftmain"] div[@class="Detailed"] dl>dd>ol').text().strip().split(':')
        name = content('h1[@id="title"]').text()
        brandname = detail[4].split('\n')[0]
        # Match a known manufacturer in the title; fall back to the full name if none matches.
        carbrand = name
        for brand in carbrandlist:
            if name.find(brand) != -1:
                carbrand = brand
                break
        data_dict = {
            'name': name,
            'carbrand': brandname,
            'bordingdate': detail[6].split('\n')[0],
            'km': detail[7].split('\n')[0],
            'price': detail[1].split('\n')[0],
            'No': detail[2].split('\n')[0],
            'image': result
        }
        if not data_dict['name']:
            # An empty title usually means an anti-crawler page; dump it re-decoded for debugging.
            print(str(content).encode('ISO-8859-1').decode('utf-8'))
return data_dict, result
    def request_download(self, img_url, carbrand):
        global image_num
        print("downloading ", img_url)
        r = requests.get(img_url, verify=False)
        # run() has already advanced car_num, so the current car's folder index is car_num - 1.
        folder = os.path.join(prepath, carbrand, carbrand + str(car_num - 1))
        image_name = carbrand + str(car_num - 1) + '_' + str(image_num) + '.png'
        # Bytes are written as-is; the .png extension is used regardless of the real format.
        with open(os.path.join(folder, image_name), 'wb') as f:
            f.write(r.content)
        image_num += 1
def run(self):
global car_num
for pageurl in self.page_url():
# print(pageurl)
for detail_url in self.index_page(pageurl):
# print("datail is ", detail_url)
listout, result = self.detail_page(detail_url)
data_string = json.dumps(listout, ensure_ascii=False)
carbrand = listout['carbrand']
            filename = carbrand + str(car_num)
            cardir = os.path.join(prepath, carbrand, filename)
            # Create the per-car directory if it does not exist yet.
            if not os.path.exists(cardir):
                os.makedirs(cardir)
            # Append the metadata JSON next to the images.
            with open(os.path.join(cardir, filename + ".txt"), "a+", encoding='utf-8') as f:
                f.write(data_string)
            car_num += 1
# print("list is ", listout)
            # Download at most 7 images for this car.
            for img_url in result[:7]:
                self.request_download(img_url, carbrand)
print("暂停5-15秒,防止被关小黑屋")
time.sleep(random.randint(5, 15))
print('*' * 200)
if __name__ == '__main__':
kccrawler = KeCheCrawler()
kccrawler.run()