两种方式爬取:Get和Post
主要使用 urllib.request 模块中的 Request 类(构造请求)和 urlopen 函数(发送请求并获取响应)
from urllib.request import Request, urlopen

# A GET request.
url = 'http://www.douban.com'
# Send a browser-like User-Agent so that sites which reject the default
# urllib client string still answer the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# The context manager closes the connection even if read()/decode() raises,
# and the timeout keeps the script from hanging on an unresponsive server.
with urlopen(Request(url, headers=headers), timeout=10) as res:
    # read() returns raw bytes (not text in any particular encoding); they
    # are decoded here as UTF-8 to obtain a str for printing.
    print(res.read().decode('utf-8'))
from urllib.request import Request, urlopen
import urllib.parse

# A POST request.
# httpbin.org/post is a test endpoint for POST requests; a POST has to carry
# form data in its body to be handled normally.
url = "http://httpbin.org/post"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
# Convert the dict into URL-encoded "key=value" form data, then into bytes,
# because Request requires `data` to be a bytes object.
# The variable names deliberately avoid `bytes`, `urlencode` and `Request`:
# rebinding those would shadow the builtin / the imported class for every
# statement that runs afterwards.
form_data = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")
post_request = Request(url, headers=headers, data=form_data, method="POST")
# urlopen accepts either a Request object or a plain URL; give up after 2 s.
# The context manager closes the connection once the body has been read.
with urlopen(post_request, timeout=2) as urlopen1:
    print(urlopen1.read().decode("utf-8"))
代理爬虫
urllib方式
from urllib.request import Request, urlopen, ProxyHandler, build_opener

# Testing a proxy with urllib.
url = "http://httpbin.org/get"
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
request = Request(url=url, headers=headers)
# Map each URL scheme to the proxy address that should carry it.
httpproxy_handler = ProxyHandler({
    "http": "122.51.231.113:8080",
    "https": "122.51.231.113:8080",
})
# Build an opener that routes its requests through the proxy handler.
opener = build_opener(httpproxy_handler)
# Fetch the page through the opener.  Public proxies are often dead, so a
# timeout prevents the script from blocking forever, and the context manager
# closes the connection once the body has been read.
with opener.open(request, timeout=10) as response:
    print(response.read().decode('utf-8'))
requests方式
import requests

# The same proxy test, using the requests library.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
# Proxy to use for each URL scheme.
# NOTE(review): the "https" entry uses an https:// proxy URL, which makes the
# client speak TLS to the proxy itself; if this is a plain HTTP proxy the
# value should probably start with http:// instead -- confirm.
proxies = {
    "http": "http://122.51.231.113:8080",
    "https": "https://122.51.231.113:8080",
}
# `headers` must be passed by keyword: the second positional parameter of
# requests.get is `params`, so passing it positionally would append the
# User-Agent to the query string instead of sending it as a request header.
# The timeout keeps an unresponsive proxy from hanging the script.
get = requests.get(
    "http://www.baidu.com",
    headers=headers,
    proxies=proxies,
    timeout=10,
).text
print(get)
下载文件(图片)
import requests
import os

# Download an image and save it to jpg/hello.jpg.
# The timeout keeps the script from hanging on an unresponsive server.
get = requests.get(
    'https://img0.baidu.com/it/u=1986451467,394304688&fm=26&fmt=auto&gp=0.jpg',
    timeout=10,
)
# Fail loudly on an HTTP error instead of saving an error page as the image.
get.raise_for_status()
# Create the target directory; exist_ok avoids a FileExistsError when the
# script is run a second time.
os.makedirs('jpg', exist_ok=True)
with open('jpg/hello.jpg', 'wb') as o:
    # .content is the raw response body as bytes, hence the binary mode.
    o.write(get.content)
print('下载完成')
print("hello,python")