# coding:utf-8
__author__ = 'BONFY CHEN'
import requests
import re
# Folder that downloaded pictures are written into; file names are appended
# directly to this string, so it ends with a path separator.
BASE_FOLDER = 'D:/xxx_folder/'

# Default value for the ``proxies`` argument of the HTTP requests (no proxy).
PROXIES = None

# Default HTTP request headers sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/38.0.2125.122 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
class xcarDown(object):
    """Walk an xcar.com.cn picture gallery page by page and save the pictures.

    Each gallery page is fetched, decoded as GBK and searched with the regular
    expressions in ``_patterns``; the picture found on the page is written
    into the base folder and the "next page" link is followed until a page
    has no usable link.
    """

    _base_folder = None
    _proxies = None
    _headers = None
    # Prefix prepended to the "next page" link found on each page.
    _website = 'http://newcar.xcar.com.cn'
    # Class-level default kept so the attribute always exists; __init__ gives
    # every instance its own list so results are not shared between instances.
    _xcar_lst = []

    # Regular expressions applied to each page, keyed by the named group they
    # capture.  Treated as read-only configuration; to override, assign a new
    # dict on a subclass or instance rather than mutating this one.
    #
    # NOTE(review): the pattern literals for 'pic_url', 'title', 'cont' and
    # 'model' were truncated in the text this file was recovered from and
    # cannot be reconstructed from what is left.  They are deliberately left
    # as None (not guessed); restore them from the original source.  Only the
    # 'n_url' pattern could be recovered: its group name is the key read from
    # the match's groupdict.
    _patterns = {
        'n_url': r"var nextUrl = '(?P<n_url>.*.htm)'",
        'pic_url': None,
        'title': None,
        'cont': None,
        'model': None,
    }

    def __init__(self, base_folder=BASE_FOLDER, proxies=PROXIES, headers=HEADERS):
        """
        :param base_folder: folder prefix the pictures are written into
        :param proxies: value passed as ``proxies`` to every HTTP request
        :param headers: value passed as ``headers`` to every HTTP request
        """
        self.set_base_folder(base_folder)
        self.set_headers(headers)
        self.set_proxies(proxies)
        # Per-instance result list (one dict per visited page).
        self._xcar_lst = []

    def set_base_folder(self, base_folder):
        """Set the folder prefix that downloaded file names are appended to."""
        self._base_folder = base_folder

    def set_headers(self, headers):
        """Set the HTTP headers sent with every request."""
        self._headers = headers

    def set_proxies(self, proxies):
        """Set the proxies mapping used for every request."""
        self._proxies = proxies

    def _search(self, key, text, default):
        """Return named group *key* from the first match of its pattern in
        *text*, or *default* when the pattern is unset or does not match."""
        pattern = self._patterns.get(key)
        if pattern is None:
            return default
        match = re.search(pattern, text)
        return match.group(key) if match else default

    def download_image_from_url(self, url, name=None):
        """
        Download one image and store it in the base folder.

        :param url: the resource image url
        :param name: optional prefix for the destination file name; when
                     given, the file is named ``<name>_<last url segment>``,
                     otherwise just ``<last url segment>``
        :return: the local file name (without the base folder)
        """
        basename = url.split('/')[-1]
        # The declared default is None, so it must not be concatenated.
        local_filename = basename if name is None else name + '_' + basename
        r = requests.get(url, proxies=self._proxies, headers=self._headers, stream=True)
        try:
            # The base folder is used as a plain string prefix, so it has to
            # end with a path separator.
            with open(self._base_folder + local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # skip empty chunks
                        f.write(chunk)
        finally:
            # Release the streamed connection even if writing fails.
            r.close()
        return local_filename

    def download_xcar(self, url):
        """
        Download the picture of a gallery page and of every following page.

        :param url: the source url in xcar.com.cn
                    http://newcar.xcar.com.cn/2674/2015/detail/1.htm
        :return: None; one dict per visited page (pic_url, next_url, title,
                 cont, model) is appended to ``self._xcar_lst``
        :raises NotImplementedError: while any entry of ``_patterns`` is
                 still None (see the note on ``_patterns``)
        """
        # Fail loudly instead of crawling every page without being able to
        # find a single picture.
        missing = sorted(key for key, pattern in self._patterns.items() if pattern is None)
        if missing:
            raise NotImplementedError(
                'page pattern(s) not restored: ' + ', '.join(missing))

        # Iterate instead of recursing once per page, so long galleries
        # cannot run into the interpreter's recursion limit.
        while True:
            r = requests.get(url, proxies=self._proxies, headers=self._headers)
            r.encoding = 'gbk'
            page = r.text

            next_url = self._search('n_url', page, None)
            pic_url = self._search('pic_url', page, None)
            title = self._search('title', page, '')
            cont = self._search('cont', page, '')
            model = self._search('model', page, '')

            if pic_url:
                try:
                    self.download_image_from_url(pic_url, name='_'.join([model, title, cont]))
                    print('download complete: pic from {} '.format(pic_url))
                except IOError:
                    # The long descriptive name could not be used; retry with
                    # the model alone as file name prefix.
                    print('file name IOERROR')
                    self.download_image_from_url(pic_url, name=model)
                    print('download complete: pic from {} '.format(pic_url))
                except Exception as e:
                    print(e)

            self._xcar_lst.append(
                dict(pic_url=pic_url, next_url=next_url, title=title, cont=cont, model=model))

            # Stop when the page has no next link (next_url is None) or the
            # link does not point at another .htm page.
            if not next_url or next_url[-4:] != '.htm':
                break
            url = self._website + next_url
if __name__ == '__main__':
    # Interactive entry point: ask for a model id and a year, build the URL
    # of the first gallery page and download the gallery starting from it.
    try:
        # Python 2: raw_input returns the typed text as-is, whereas input()
        # would evaluate it as an expression.
        read_line = raw_input
    except NameError:
        # Python 3: input() already returns the plain text.
        read_line = input

    print("Welcome to the Pic Download for xcar.com")
    print("Downloaded files in the folder: " + BASE_FOLDER)
    print("---------------------------------------")
    id_modell = int(read_line("Please enter the modell id(eg.2674): "))
    year = int(read_line("Please enter the year (eg.2015): "))
    url = 'http://newcar.xcar.com.cn/{}/{}/detail/1.htm'.format(id_modell, year)
    xcar = xcarDown()
    xcar.download_xcar(url)