#! python3
# coding:utf-8
import urllib.request
import requests
from bs4 import BeautifulSoup
from time import sleep
# Request headers for fetching the search result pages
result_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Connection': 'Keep-Alive',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2',
    # Cookie captured from a browser session; it may have expired and need refreshing
    'Cookie': 'ASP.NET_SessionId=mwwuspudqdxsm354g5xsqugd; CNZZDATA5617436=cnzz_eid%3D1133206276-1525702343-%26ntime%3D1525702343; Hm_lvt_f0a8192e2505f1dedce950f8913237d6=1525702344; Hm_lpvt_f0a8192e2505f1dedce950f8913237d6=1525702344; bdshare_firstime=1525702344314; Hm_lvt_fc0f20ff478c08368cdbb875241c3fe8=1525702344; Hm_lpvt_fc0f20ff478c08368cdbb875241c3fe8=1525702344; safedog-flow-item=8328FEE3689468599F693C625BEDF947; UM_distinctid=1633af23bc1b-02a69eb9c5ba15-38401c1f-e82f7-1633af23bc2159',
}
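# A quick way to check that these headers (and in particular the session cookie) are still
# accepted by the site -- a minimal sketch, assuming the listing URL used later in this script;
# it is illustrative only and not part of the crawl:
#
#     r = requests.get("http://www.carpoly.com.cn/Decorative/Service/Shop_list.aspx",
#                      headers=result_headers)
#     print(r.status_code)   # a 200 response means the request itself went through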
# Directory where the result file is saved
directory = "C:\\Temp\\"
# b'\xe5\xba\x97\xe9\x93\xba\xe5\x90\x8d\xe7\xa7\xb0\t\xe5\x9c\xb0\xe5\x9d\x80' == "店铺名称\t地址".encode()  (i.e. "shop name\taddress")
table_head = b'\xe5\xba\x97\xe9\x93\xba\xe5\x90\x8d\xe7\xa7\xb0\t\xe5\x9c\xb0\xe5\x9d\x80'
# List of result rows, starting with the header row
content = [table_head.decode() + '\n']
def getShops(list_shop):
    '''
    Collect every shop on a single page.
    Each <dl> is expected to hold the shop name in its first <dt>
    and the address in its first <p>.
    '''
    for shop_index in range(len(list_shop)):
        list_dt = list_shop[shop_index].find_all("dt")
        list_p = list_shop[shop_index].find_all("p")
        shop = list_dt[0].text
        address = list_p[0].text
        content.append('\t'.join((shop, address)) + '\n')
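# A minimal sketch of how getShops consumes the markup, assuming each listing looks
# roughly like the <div class="xianshi_dz"><dl><dt>name</dt>...<p>address</p></dl> structure
# the parser expects (the sample HTML is illustrative, not taken from the site), kept
# commented out so it does not add rows to `content` during a real run:
#
#     sample = BeautifulSoup(
#         '<div class="xianshi_dz"><dl><dt>Sample Shop</dt><p>Sample Address</p></dl></div>',
#         "html.parser")
#     getShops(sample.find("div", "xianshi_dz").find_all("dl"))
#     # content now ends with 'Sample Shop\tSample Address\n'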
def getAllShops():
    '''
    Collect the shop information from every result page.
    '''
    # urllib.request.urlopen() does not support authentication, cookies, or other advanced
    # HTTP features on its own; build_opener() creates a custom opener for that.
    # (Note: the requests calls below do not go through this opener.)
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install this opener as the global opener used by urlopen()
    urllib.request.install_opener(opener)
    # If the connection fails, wait 5 seconds and retry indefinitely
    success = False
    while not success:
        try:
            url = "http://www.carpoly.com.cn/Decorative/Service/Shop_list.aspx"
            print(url)
            # Send a GET request for the first page of shop listings
            r = requests.get(url, headers=result_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            success = True
    # Parse the response
    html = r.text
    bsObj = BeautifulSoup(html, "html.parser")
    list_link = bsObj.find("div", "page").find_all("span")
    pages = int(list_link[len(list_link) - 2].text)
    list_shop = bsObj.find("div", "xianshi_dz").find_all("dl")
    getShops(list_shop)
    # Fetch the remaining paginated result pages
    for page_index in range(2, pages + 1):
        url = "http://www.carpoly.com.cn/Decorative/Service/Shop_list.aspx?currentPage=" + str(page_index) + "&pageSize=10"
        print(url)
        # If the connection fails, wait 5 seconds and retry indefinitely
        success = False
        while not success:
            try:
                # Send a GET request for this page of shop listings
                r = requests.get(url, headers=result_headers)
            except requests.exceptions.ConnectionError:
                sleep(5)
            else:
                success = True
        html_data = r.text
        ship_bsObj = BeautifulSoup(html_data, "html.parser")
        # Parse the current page's soup, not the first page's bsObj
        list_shop = ship_bsObj.find("div", "xianshi_dz").find_all("dl")
        getShops(list_shop)
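# The retry-on-ConnectionError loop above appears twice; a sketch of one way to factor it
# out (get_with_retry is a hypothetical helper and is not called by the code in this script):
def get_with_retry(url, headers, delay=5):
    '''Keep retrying a GET request until it succeeds, sleeping `delay` seconds between attempts.'''
    while True:
        try:
            return requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError:
            sleep(delay)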
def outputResultFile():
    '''
    Write the collected rows to the result file.
    '''
    path_name = directory + 'fendian_jiabaoli.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # Write every row collected in `content`
        for row in content:
            file.write(row)
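# The output is effectively a tab-separated file; a sketch of the same output written with
# the csv module instead (outputResultCsv is a hypothetical alternative, not called anywhere,
# assuming each row in `content` keeps the "name\taddress\n" shape built by getShops):
def outputResultCsv(path_name=directory + 'fendian_jiabaoli.csv'):
    import csv
    with open(path_name, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        for row in content:
            writer.writerow(row.rstrip('\n').split('\t'))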
def start_crawl():
    '''
    Main entry point of the crawler.
    '''
    # Collect all shop information
    getAllShops()
    # Write the result file
    outputResultFile()
    # Done
    print('Complete!')
# Run the crawler
start_crawl()