"""
为对付反爬,有几个重要的手段,此处采用了随机ua和随机睡眠;还有代理IP等手段。
"""
import urllib.request
import urllib.parse
from fake_useragent import UserAgent
from time import sleep
from random import randint
from tqdm import tqdm
def create_request(page_index: int):
    url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    ua = UserAgent()  # build a UserAgent object so every request carries a random UA string
    headers = {
        'User-Agent': ua.random,
        # Session-specific cookie captured from a browser; it may expire and need refreshing.
        'Cookie': 'route-cell=ksa; ASP.NET_SessionId=jh3ulkzmpqukae5k5b5ya3n4; VOLCALB=839681b35f197b4ed33d4bc5335bdf66|1703947393|1703947080; VOLCALBCORS=839681b35f197b4ed33d4bc5335bdf66|1703947393|1703947080'
    }
    data = {
        'cname': '北京',
        'pid': '',  # urlencode would serialize None as the literal string 'None', so use '' instead
        'pageIndex': page_index,
        'pageSize': 10
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    # Passing data makes urllib issue a POST request, which this endpoint expects.
    request = urllib.request.Request(url=url, headers=headers, data=data)
    return request
def get_content(input_request):
    # Send the request and decode the UTF-8 response body.
    response = urllib.request.urlopen(input_request)
    return response.read().decode('utf-8')
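# A minimal sketch of a more defensive fetch, assuming we want a timeout plus a
# few retries on transient network errors; the retry count, timeout, and backoff
# values below are illustrative assumptions, not part of the original script.
def get_content_with_retry(input_request, retries: int = 3, timeout: int = 10):
    import urllib.error
    for attempt in range(retries):
        try:
            response = urllib.request.urlopen(input_request, timeout=timeout)
            return response.read().decode('utf-8')
        except urllib.error.URLError:
            sleep(2 * (attempt + 1))  # back off a little longer after each failure
    raise RuntimeError('all retry attempts failed')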
def download_resource(download_content):
    # Append each page's raw response body; mode 'a' keeps previously written pages.
    with open('kfc_Beijing.json', 'a', encoding='utf-8') as f:
        f.write(download_content)
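# Appending raw response bodies back to back means kfc_Beijing.json is not one
# valid JSON document. A minimal alternative sketch, assuming the endpoint
# returns a JSON object whose 'Table1' key holds the store list (an assumption
# about the response shape, not something the original script verifies): write
# one record per line in JSON Lines format instead.
def download_resource_jsonl(download_content):
    import json
    records = json.loads(download_content).get('Table1', [])  # 'Table1' is assumed
    with open('kfc_Beijing.jsonl', 'a', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')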
if __name__ == '__main__':
    final_page = int(input('Enter the last page to scrape: '))
    for page in tqdm(range(1, final_page + 1)):
        request = create_request(page)
        content = get_content(request)
        sleep(randint(2, 5))  # pause a random few seconds so we do not overload the server
        download_resource(content)
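# Hedged sketch of the proxy-IP technique the module docstring mentions but the
# script does not implement. The proxy address below is a placeholder assumption;
# substitute a working proxy before relying on this.
def get_content_via_proxy(input_request, proxy_address: str = 'http://127.0.0.1:8080'):
    handler = urllib.request.ProxyHandler({'http': proxy_address, 'https': proxy_address})
    opener = urllib.request.build_opener(handler)  # opener that routes traffic through the proxy
    response = opener.open(input_request)
    return response.read().decode('utf-8')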