#! python3
# coding:utf-8
import urllib.request
import requests
from bs4 import BeautifulSoup
from time import sleep
# Request headers for scraping the search-results page
result_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Connection': 'Keep-Alive',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2',
    'Cookie': 'PHPSESSID=f1hhe1n10b735d59vcomgdql91; Hm_lvt_af55d26623d35566407534f69e434677=1525910281; Hm_lpvt_af55d26623d35566407534f69e434677=1525910281; __v3_c_sesslist_11210=f0w1i6y24h_dmk; __v3_c_pv_11210=1; __v3_c_session_11210=1525910280117569; __v3_c_today_11210=1; __v3_c_review_11210=0; __v3_c_last_11210=1525910280117; __v3_c_visitor=1525910280117569; __v3_c_session_at_11210=1525910292142'
}
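
# A hedged alternative (sketch, not from the original): the hardcoded PHPSESSID
# cookie above is session-specific and will expire. A requests.Session can fetch
# fresh cookies from the search page instead; make_session() is a hypothetical
# helper name.
def make_session():
    '''Return a requests.Session seeded with cookies from the search page (sketch).'''
    session = requests.Session()
    # Reuse the headers above, minus the stale hardcoded Cookie
    session.headers.update({k: v for k, v in result_headers.items() if k != 'Cookie'})
    session.get("http://www.skshu.com.cn/Agency/result.html?province_id=&city_id=&keyword=")
    return session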
# Directory where the result file is saved
directory = "C:\\Temp\\"
# Collected result lines
content = []
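
# A bounded-retry helper (sketch): the loop in getAllShops() below retries
# forever on connection errors, which can hang the crawler if the site is down.
# Capping the attempts is safer; max_attempts and the 5-second delay here are
# assumptions, not taken from the original code.
def get_with_retry(url, max_attempts=5, **kwargs):
    '''GET url, retrying on connection errors up to max_attempts times (sketch).'''
    for _ in range(max_attempts):
        try:
            return requests.get(url, **kwargs)
        except requests.exceptions.ConnectionError:
            sleep(5)
    raise requests.exceptions.ConnectionError(
        'giving up on %s after %d attempts' % (url, max_attempts))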
def getShops(list_shop):
    '''
    Extract all shops from a single results page.
    '''
    for shop_row in list_shop:
        list_td = shop_row.find_all("td")
        # Skip header or otherwise malformed rows (defensive guard, not in the original)
        if len(list_td) < 6:
            continue
        shop = list_td[0].text
        province = list_td[1].text
        city = list_td[2].text
        state = list_td[3].text
        address = list_td[4].text
        identification = list_td[5].text
        content.append(province + '\t' + city + '\t' + state + '\t' + shop
                       + '\t' + address + '\t' + identification + '\n')
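
# From the indexing above, each result row is assumed to have six cells:
# <tr><td>shop</td><td>province</td><td>city</td><td>state/district</td>
#     <td>address</td><td>identification</td></tr>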
def getAllShops():
    '''
    Fetch all shop records.
    '''
    # urllib.request.urlopen() does not support authentication, cookies or other
    # advanced HTTP features; those require a custom Opener object created with
    # build_opener(). (The requests calls below handle this themselves, so the
    # opener is effectively unused here.)
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install the opener as the global opener used by urlopen()
    urllib.request.install_opener(opener)
    # If the connection fails, wait 5 seconds and retry indefinitely
    success = False
    while not success:
        try:
            # Request the Sankeshu paint dealer-search page
            r = requests.get("http://www.skshu.com.cn/Agency/result.html?province_id=&city_id=&keyword=",
                             headers=result_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            success = True
    # Parse the response: the first page of dealer-search results
    html = r.text
    bsObj = BeautifulSoup(html, "html.parser")
    # The pagination link exposes the total page count in its data-total-page attribute
    pages = int(bsObj.find(href="/Agency/ajax_lists.html").attrs["data-total-page"])
    list_shop = bsObj.find(id="content-list").find_all("tr")
    getShops(list_shop)
    # Fetch the remaining pages of shops from the AJAX listing endpoint
    for page_index in range(2, pages + 1):
        payload = {'p': page_index}
        print(page_index)
        r = requests.post("http://www.skshu.com.cn/Agency/ajax_lists.html",
                          data=payload, headers=result_headers)
        html_data = r.text
        shop_bsObj = BeautifulSoup(html_data, "html.parser")
        list_shop = shop_bsObj.find_all("tr")
        getShops(list_shop)
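
# Politeness note (assumption, not in the original): adding sleep(1) inside the
# pagination loop above would throttle the back-to-back POST requests.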
def outputResultFile():
    '''
    Write the results to a file.
    '''
    # directory already ends with a separator, so no extra backslash is needed
    path_name = directory + 'fendian_sankeshu.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # Write every collected line
        for line in content:
            file.write(line)
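
# An alternative writer (sketch): the stdlib csv module handles delimiters and
# quoting more robustly than hand-joined '\t' strings. outputResultCsv() is a
# hypothetical helper that re-splits the lines getShops() pre-joined.
import csv
def outputResultCsv():
    '''Write the collected rows as a tab-separated file via csv.writer (sketch).'''
    with open(directory + 'fendian_sankeshu.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        for line in content:
            writer.writerow(line.rstrip('\n').split('\t'))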
def start_crawl():
    '''
    Main crawler routine.
    '''
    # Fetch all shop records
    getAllShops()
    # Write the result file
    outputResultFile()
    # Done
    print('Complete!')

# Run the crawler
start_crawl()