#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2019/9/4 17:04
# @Author : Frank Hu
# @Email : 1171624400@qq.com
# @File : advanced_link_crawler_using_requests.py
# @Software: PyCharm
from urllib import robotparser
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
import re
import time


def download(url, num_retries=2, user_agent='wswp', proxies=None):
    """Download the given URL and return the page content.

    args:
        url (str): URL
    kwargs:
        user_agent (str): user agent (default: wswp)
        proxies (dict): proxy dict w/ keys 'http' and 'https', values
                        are strs (i.e. 'http(s)://IP') (default: None)
        num_retries (int): # of retries if a 5xx error is seen (default: 2)
    """
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            print('Download error:', resp.text)
            html = None
            if num_retries and 500 <= resp.status_code < 600:  # retry 5xx server errors
                return download(url, num_retries - 1)
    except requests.exceptions.RequestException as e:
        print('Download error:', e)
        html = None
    return html
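

# A minimal usage sketch, not part of the original script: the URL is a
# placeholder, and the link-counting regex is only an illustration of what a
# caller might do with the returned HTML.
if __name__ == '__main__':
    # proxies could be passed as e.g. {'http': 'http://10.10.1.10:3128'}
    # (placeholder address) to route the request through a proxy
    html = download('http://example.com', user_agent='wswp')
    if html is not None:
        # count anchor tags as a quick sanity check on the fetched page
        links = re.findall(r"""<a[^>]+href=["'](.*?)["']""", html, re.IGNORECASE)
        print('Fetched {} characters, found {} links'.format(len(html), len(links)))
    else:
        print('Download failed')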