#! python3
# coding:utf-8
import json
import os
import urllib
import urllib.request
from time import sleep
from urllib.parse import quote

import requests
# HTTP headers for the store-locator Ajax endpoint: mimics an IE11
# XMLHttpRequest; the Cookie value was captured from a live browser session.
main_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
,'DNT':'1'
,'Accept-Encoding': 'gzip, deflate'
,'Accept':'application/json, text/javascript, */*; q=0.01','X-Requested-With': 'XMLHttpRequest'
,'Accept-Language':'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2','Cookie':'uid=CvwBTFrzEOOuGgVmEMKMAg==; has_js=1; __clickidc=152587901641657657; _ga=GA1.3.1733272225.1525879020; _gid=GA1.3.45889085.1525879020; _gat_UA-69298406-2=1; a=xpQGf0ZMu925; Drupal.session_cache.sid=yEU9GqkgJAl7'
}
# Directory where the result file is saved.
directory = "C:\\Temp\\"
# Province/city/county links.
# NOTE(review): never populated or read anywhere in this file — confirm
# whether it can be removed.
list_url = []
# The UTF-8 bytes below decode to the tab-separated table header
# "省市/区域\t城市\t商铺名称\t地址\t电话\t邮编\t经纬度\t授权号\t链接",
# i.e. province/region, city, store name, address, phone, postal code,
# coordinates, authorization number, link.
table_head = b'\xe7\x9c\x81\xe5\xb8\x82/\xe5\x8c\xba\xe5\x9f\x9f\t\xe5\x9f\x8e\xe5\xb8\x82\t\xe5\x95\x86\xe9\x93\xba\xe5\x90\x8d\xe7\xa7\xb0\t\xe5\x9c\xb0\xe5\x9d\x80\t\xe7\x94\xb5\xe8\xaf\x9d\t\xe9\x82\xae\xe7\xbc\x96\t\xe7\xbb\x8f\xe7\xba\xac\xe5\xba\xa6\t\xe6\x8e\x88\xe6\x9d\x83\xe5\x8f\xb7\t\xe9\x93\xbe\xe6\x8e\xa5'
# Accumulated result rows; starts with the decoded header line.
content = [table_head.decode() + '\n']
# b'\xe4\xb8\xad\xe5\x9b\xbd' decodes to "中国" ("China") — the search address.
china = b'\xe4\xb8\xad\xe5\x9b\xbd'
def _fetch_store_page(pagenum):
    """Fetch one page of store records from the store-locator Ajax endpoint.

    Retries forever on connection failures, sleeping 5 seconds between
    attempts (same best-effort behaviour as before).

    Args:
        pagenum: zero-based page index to request.

    Returns:
        The parsed JSON payload — a list of store dicts, empty once
        ``pagenum`` is past the last page.
    """
    # NOTE(review): quote() pre-encodes the address even though requests
    # form-encodes the payload again; kept as-is because the server
    # evidently accepts the double-encoded value — confirm before changing.
    payload = {
        'address': quote(china.decode()),
        'attributes': '',
        'language': 'zh',
        'pagenum': pagenum,
        'curlat': 39.904989,
        'curlng': 116.405285,
    }
    while True:
        try:
            r = requests.post("https://www.dulux.com.cn/find/store-ajax",
                              data=payload, headers=main_headers)
        except requests.exceptions.ConnectionError:
            # Connection failed: wait 5 seconds, then retry indefinitely.
            sleep(5)
        else:
            return json.loads(r.text)


def getAllProvinces():
    """Crawl every page of store data for all provinces/cities/counties.

    Appends one tab-separated line per store to the module-level
    ``content`` list (side effect); performs network I/O.

    Fixes over the original:
    * page 0 was fetched but its records were discarded (overwritten by
      page 1 before being processed) — every page is now appended;
    * the dead ``urllib.request`` opener setup was removed (all HTTP goes
      through ``requests``);
    * the duplicated payload/retry/request code is factored into
      ``_fetch_store_page``.
    """
    pagenum = 0
    while True:
        data = _fetch_store_page(pagenum)
        if not data:
            # Empty list => past the last page; crawl is complete.
            break
        for item in data:
            # lon/lat arrive as strings; they are joined as "lon,lat".
            content.append('\t'.join((
                item['region'],
                item['city'],
                item['companyname'],
                item['address'],
                item['phone'],
                item['postalcode'],
                item['lon'] + ',' + item['lat'],
                item['subtitle'],     # authorization number
                item['websiteurl'],
            )) + '\n')
        # Progress indicator: the page just processed.
        print(pagenum)
        pagenum += 1
def outputResultFile(lines=None, out_dir=None):
    """Write the crawled rows to ``fendian_duoleshi.txt``.

    Args:
        lines: iterable of newline-terminated strings to write; defaults
            to the module-level ``content`` result list.
        out_dir: destination directory; defaults to the module-level
            ``directory`` setting. Both defaults keep the original
            zero-argument call working unchanged.
    """
    if lines is None:
        lines = content
    if out_dir is None:
        out_dir = directory
    # os.path.join avoids the doubled backslash the original produced
    # ("C:\\Temp\\" + "\\fendian..."); Windows tolerated it, but the
    # joined path is correct on any platform.
    path_name = os.path.join(out_dir, 'fendian_duoleshi.txt')
    with open(path_name, 'w', encoding='utf-8') as file:
        # writelines replaces the index-based loop over the list.
        file.writelines(lines)
def start_crawl():
    """Top-level crawl driver.

    Scrapes every store record for all provinces/cities/counties, then
    writes the collected rows to the result file.
    """
    # Step 1: collect all store records into the module-level list.
    getAllProvinces()
    # Step 2: flush the collected rows to disk.
    outputResultFile()
    # Step 3: signal completion.
    print('Complete!')
# Entry point: only start crawling when executed as a script, so that
# importing this module no longer triggers the full crawl as a side effect.
if __name__ == '__main__':
    start_crawl()