"""
为对付反爬,有几个重要的手段,此处采用了随机ua和随机睡眠;还有代理IP等手段。
"""
import urllib.request
import urllib.parse
from fake_useragent import UserAgent
from time import sleep
from random import randint
from tqdm import tqdm
def create_request(page_index: int):
    url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    ua = UserAgent()  # build a UserAgent object so every request carries a random UA string
    headers = {
        'User-Agent': ua.random,
        # Session-specific cookie captured from a browser; it may expire and need refreshing.
        'Cookie': 'route-cell=ksa; ASP.NET_SessionId=jh3ulkzmpqukae5k5b5ya3n4; VOLCALB=839681b35f197b4ed33d4bc5335bdf66|1703947393|1703947080; VOLCALBCORS=839681b35f197b4ed33d4bc5335bdf66|1703947393|1703947080'
    }
    data = {
        'cname': '北京',
        'pid': '',  # urlencode would serialize None as the literal string 'None', so use '' instead
        'pageIndex': page_index,
        'pageSize': 10
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    # Passing data makes urllib issue a POST request, which this endpoint expects.
    request = urllib.request.Request(url=url, headers=headers, data=data)
    return request
def get_content(input_request):
    # Send the request and decode the UTF-8 response body.
    response = urllib.request.urlopen(input_request)
    return response.read().decode('utf-8')
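# A minimal sketch of a more defensive fetch, assuming we want a timeout plus a
# few retries on transient network errors; the retry count, timeout, and backoff
# values below are illustrative assumptions, not part of the original script.
def get_content_with_retry(input_request, retries: int = 3, timeout: int = 10):
    import urllib.error
    for attempt in range(retries):
        try:
            response = urllib.request.urlopen(input_request, timeout=timeout)
            return response.read().decode('utf-8')
        except urllib.error.URLError:
            sleep(2 * (attempt + 1))  # back off a little longer after each failure
    raise RuntimeError('all retry attempts failed')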
def download_resource(download_content):
    # Append each page's raw response body; mode 'a' keeps previously written pages.
    with open('kfc_Beijing.json', 'a', encoding='utf-8') as f:
        f.write(download_content)
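# Appending raw response bodies back to back means kfc_Beijing.json is not one
# valid JSON document. A minimal alternative sketch, assuming the endpoint
# returns a JSON object whose 'Table1' key holds the store list (an assumption
# about the response shape, not something the original script verifies): write
# one record per line in JSON Lines format instead.
def download_resource_jsonl(download_content):
    import json
    records = json.loads(download_content).get('Table1', [])  # 'Table1' is assumed
    with open('kfc_Beijing.jsonl', 'a', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')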
if __name__ == '__main__':
    final_page = int(input('Enter the last page to scrape: '))
    for page in tqdm(range(1, final_page + 1)):
        request = create_request(page)
        content = get_content(request)
        sleep(randint(2, 5))  # pause a random few seconds so we do not overload the server
        download_resource(content)
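# Hedged sketch of the proxy-IP technique the module docstring mentions but the
# script does not implement. The proxy address below is a placeholder assumption;
# substitute a working proxy before relying on this.
def get_content_via_proxy(input_request, proxy_address: str = 'http://127.0.0.1:8080'):
    handler = urllib.request.ProxyHandler({'http': proxy_address, 'https': proxy_address})
    opener = urllib.request.build_opener(handler)  # opener that routes traffic through the proxy
    response = opener.open(input_request)
    return response.read().decode('utf-8')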