import urllib.request
1.开启页面,抓取内容
# Target page: the Douban Movie Top 250 listing.
url = 'https://movie.douban.com/top250'
# Request the page directly, without any custom headers. This is expected to
# fail: Douban's server applies anti-crawler checks, refuses clients it does
# not recognise as a browser, and answers with HTTP ERROR 418.
response = urllib.request.urlopen(url=url)
2.将爬虫伪装成浏览器
需要用到两个请求头字段:Connection 和 User-Agent。获取方法:
用浏览器打开想要爬取的网站,按 F12 打开开发者工具,切换到 Network(网络)面板并刷新页面,点击任意一个请求,在 Request Headers(请求标头)中即可找到这两个字段的值。
2.1 构建一个字典形式的请求头
# Request headers copied from a desktop Chrome session so that the crawler
# presents itself to the server as an ordinary browser.
# NOTE(review): urllib.request appears to send "Connection: close" regardless
# of the value given here -- confirm whether this entry has any effect.
db_headers = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
}
2.2 开启页面抓取内容
# Bundle the target URL and the browser-like headers into a Request object.
db_request = urllib.request.Request(url=url, headers=db_headers)
# Open the Request (rather than the bare URL) so the custom headers are sent.
db_response = urllib.request.urlopen(db_request)
2.3 输出抓取到的内容
# read() returns the page source as bytes, so it has to be decoded into a str
# before printing; take care that the codec passed to decode() is the right
# one for the page (UTF-8 is used here).
# The with-block closes the HTTP response once its body has been consumed,
# instead of leaving the connection open for the rest of the script.
with db_response:
    print(db_response.read().decode('utf-8'))
3.通过random模块实现随机请求头
3.1载入库
import urllib.request
import random
3.2 构建一个返回随机请求头的函数
def random_headers():
    """Return request headers with a randomly chosen User-Agent.

    Each call picks one entry from a fixed pool of browser User-Agent
    strings with ``random.choice`` and pairs it with
    ``'Connection': 'keep-alive'``, so successive requests do not all
    present the same browser identity.

    Returns:
        dict: A new dict with the keys ``'Connection'`` and ``'User-Agent'``.
    """
    # Pool of User-Agent strings to rotate through.
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ]
    random_header = {
        'Connection': 'keep-alive',
        'User-Agent': random.choice(user_agent),
    }
    return random_header
3.3开启页面,抓取内容
# Same target page as before: the Douban Movie Top 250 listing.
url = 'https://movie.douban.com/top250'
# Build the Request with a freshly randomised header set, then open it.
db_request = urllib.request.Request(url=url, headers=random_headers())
db_response = urllib.request.urlopen(db_request)
3.4 输出页面内容
# Decode the raw response bytes as UTF-8 and print the page source. The
# with-block closes the HTTP response once its body has been read.
with db_response:
    print(db_response.read().decode('utf-8'))