#! python3
# coding:utf-8
import json
import urllib
import urllib.error
from time import sleep
from urllib import request
from urllib.request import urlopen

import requests
# Request headers used when fetching the province list.
# NOTE(review): the hard-coded session cookies below will expire over time;
# refresh them if the site starts rejecting requests — TODO confirm.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
,'Accept':'text/html, application/xhtml+xml, */*','Connection':'Keep-Alive','DNT':'1'
,'Accept-Language':'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2','Cookie':'ASP.NET_SessionId=wrc0hppaej4hxoqkhtgng3ds; SC_ANALYTICS_GLOBAL_COOKIE=c14d814822b941e493eccd07f9b51a49|False; Hm_lvt_86fab548a9cd225ca6996b8f596fb011=1525699410; Hm_lpvt_86fab548a9cd225ca6996b8f596fb011=1525699410; nTalk_CACHE_DATA={uid:lb_1000_ISME9754_guest4B85A27C-78B2-2C,tid:1525699406108016}; NTKF_T2D_CLIENTID=guest4B85A27C-78B2-2C72-684D-3AC577F644C7; _ga=GA1.3.1823818086.1525699427; _gid=GA1.3.1721887409.1525699427; _gat=1'
}
# Request headers used when fetching district and city/county data
# (form-encoded POSTs to a JSON endpoint).
area_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
,'Accept':'application/json, text/javascript, */*; q=0.01','Connection':'Keep-Alive','DNT':'1'
,'Accept-Encoding': 'gzip, deflate','X-Requested-With': 'XMLHttpRequest'
,'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
,'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2','Cookie':'PHPSESSID=9d47ec7b3084bb4dd988070d05dd1954; SERVERID=dee5d74d516b027440f035efd0ad1d40|1525866565|1525866537; _smt_uid=5af2e02d.f9f56a0'
}
# Directory where the result file is written.
directory = "C:\\Temp\\"
# Store-search payloads, one dict per province/city/county combination.
list_url = []
# This byte literal decodes (UTF-8) to "省名\t城市\t店名\t电话\t地址\t授权号",
# i.e. "Province\tCity\tStore name\tPhone\tAddress\tLicense no." — the
# tab-separated header row of the output file.
table_head = b'\xe7\x9c\x81\xe5\x90\x8d\t\xe5\x9f\x8e\xe5\xb8\x82\t\xe5\xba\x97\xe5\x90\x8d\t\xe7\x94\xb5\xe8\xaf\x9d\t\xe5\x9c\xb0\xe5\x9d\x80\t\xe6\x8e\x88\xe6\x9d\x83\xe5\x8f\xb7'
# Accumulated output rows; seeded with the decoded header line.
content = [table_head.decode() + '\n']
# This byte literal decodes (UTF-8) to "地址关键字或门店编号"
# ("address keyword or store number") — sent as the 'search' form field.
search_blank = b'\xe5\x9c\xb0\xe5\x9d\x80\xe5\x85\xb3\xe9\x94\xae\xe5\xad\x97\xe6\x88\x96\xe9\x97\xa8\xe5\xba\x97\xe7\xbc\x96\xe5\x8f\xb7'
def getAllProvinces():
    """Crawl the full province -> city -> county tree and all store records.

    First fetches the province list, then walks province -> district ->
    city/county, appending one ``{pro, city, store, search}`` payload per
    county to the module-level ``list_url``. Finally each payload is posted
    to the store-search endpoint and one tab-separated row per store is
    appended to the module-level ``content`` list.

    Every network request is retried forever with a 5-second back-off on
    connection failure. Returns ``None``; results live in the globals.
    """
    # urlopen() alone does not support cookies or other advanced HTTP
    # features; build a custom opener and install it as the global opener
    # used by every urlopen() call below.
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

    # Fetch the province list, retrying forever on connection failure.
    response = None
    while response is None:
        try:
            req = request.Request("http://www.huarun.com/index.php/huarun/Pro",
                                  headers=headers)
            response = urlopen(req)
        except urllib.error.URLError:
            # BUG FIX: urlopen() raises urllib.error.URLError, not
            # requests.exceptions.ConnectionError — the original handler
            # could never catch a failed connection here and the script
            # crashed instead of retrying.
            sleep(5)
    list_province = json.loads(response.read().decode('utf-8'))["msg"]
    print(list_province)

    # Walk province -> district -> city/county, collecting one search
    # payload per county.
    for province in list_province:
        pro = province["pro"].encode()
        list_area = _post_for_msg("http://www.huarun.com/index.php/huarun/Area",
                                  {'pro': pro})
        print(list_area)
        for area in list_area:
            list_city = _post_for_msg("http://www.huarun.com/index.php/huarun/City",
                                      {'city': area["area"].encode()})
            print(list_city)
            for city in list_city:
                # NOTE(review): "store" is deliberately set to the same value
                # as "city", mirroring the original request payload — confirm
                # the endpoint really expects this duplication.
                list_url.append({"pro": pro,
                                 "city": city["city"].encode(),
                                 "store": city["city"].encode(),
                                 "search": search_blank})

    # Query every collected payload and accumulate the store rows.
    for payload in list_url:
        list_search = _post_for_msg(
            "http://www.huarun.com/index.php/huarun/SearchStore", payload)
        print(list_search)
        # Sample element:
        # {'address': '安徽省安庆市光彩大市场三期天柱山路109号-110号',
        #  'auth': '皖00065', 'phone': '18505226929',
        #  'sname': '安庆光彩大市场', 'vip': '0'}
        for store in list_search:
            # Strip embedded newlines so each record stays on one output line.
            address = store["address"].replace('\n', '').replace('\r', '')
            auth = store["auth"]
            phone = store["phone"].replace('\n', '').replace('\r', '')
            sname = store["sname"].replace('\n', '').replace('\r', '')
            content.append('\t'.join((payload["pro"].decode(),
                                      payload["city"].decode(),
                                      sname, phone, address, auth)) + "\n")


def _post_for_msg(url, payload):
    """POST *payload* to *url* and return the parsed JSON "msg" list.

    Retries forever with a 5-second back-off on connection failure —
    replaces the three identical retry loops of the original code.
    """
    while True:
        try:
            r = requests.post(url, data=payload, headers=area_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            return json.loads(r.text)["msg"]
def outputResultFile():
    """Write every accumulated row in ``content`` to the result file.

    The file is created (or overwritten) as UTF-8 text under the
    module-level ``directory``.
    """
    path_name = directory + '\\fendian_huarun.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # content already carries a trailing '\n' per row, so a bulk
        # writelines() reproduces the original line-by-line output exactly.
        file.writelines(content)
def start_crawl():
    """Crawler entry point: collect all store data, then dump it to disk."""
    # Crawl every province/city/county and populate the result list.
    getAllProvinces()
    # Persist the accumulated rows to the output file.
    outputResultFile()
    print('Complete!')
# Run the crawler only when executed as a script; guarding the call keeps
# the module importable (e.g. for testing) without triggering the crawl.
if __name__ == '__main__':
    start_crawl()