爬虫urllib打开网址:
# Open a URL with urllib, attaching a browser-like request header.
from urllib import request

url = 'https://olympics.com/zh/olympic-games/tokyo-2020/athletes'
# BUG FIX: the header key must be 'User-Agent' (hyphenated). urllib
# normalizes keys with str.capitalize(), so 'USER_AGENT' would be sent
# as the bogus header 'User_agent' and ignored by the server.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
req = request.Request(url=url, headers=headers)  # build request with headers attached
response = request.urlopen(req)
print(response.read().decode('utf-8'))  # read() returns the raw page body
print(response.status)  # status code: 200 = OK, 404 = not found
超时设置,错误抓取
添加timeout参数,try except断言方法抓取错误
# Timeout handling: pass timeout= to urlopen and catch the resulting error.
# BUG FIX: the original try/except bodies were not indented, which is a
# SyntaxError in Python; the block below is the runnable form.
import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('https://github.com/', timeout=5)
except urllib.error.URLError as e:
    # urlopen wraps a socket timeout in URLError; inspect e.reason to
    # distinguish a timeout from other network failures.
    if isinstance(e.reason, socket.timeout):
        print('请求超时')
拆分拼接url
urlparse和urlunparse
# Split a URL into its six components (scheme, netloc, path, params,
# query, fragment) with urlparse.
from urllib.parse import urlparse

s = urlparse('https://olympics.com/zh/olympic-games/tokyo-2020/athletes')
print(type(s), s)
# Rebuild a URL from its six components with urlunparse.
from urllib.parse import urlunparse

parts = ('https', 'olympics.com', '/zh/olympic-games/tokyo-2020/athletes', '', '', '')
print(urlunparse(parts))
网页链接由协议、域名、访问路径、参数、查询条件、锚点六部分组成；用 urlunparse 构造链接时也必须按顺序提供这六个部分。
还有urlsplit,urlunsplit,urljoin,urlencode方法
get请求:
requests库发送get请求,post请求同理
# Send a GET request with the requests library (POST works the same way
# via requests.post).
import requests

url = 'http://httpbin.org/get'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
}
response = requests.get(url, headers=headers)
print(response.status_code)  # HTTP status code
print(response.text)         # response body decoded as text