流程
- ua = random.choice(ua_list) #随机选择一个user-agent信息
- request = Request(url) #创建请求对象
- request.add_header('User-Agent', ua) #添加请求头信息
- response = urlopen(request) #发送请求,得到响应对象
- response.read() # 读取返回的内容
#有些网站是反爬虫的,所以要把爬虫伪装成浏览器。随便打开一个浏览器,复制浏览器的UA值,用来伪装
import random
import string
import urllib.parse
from urllib.request import Request, urlopen
# Some sites reject obvious crawlers, so disguise the request as a browser
# by sending a real browser's User-Agent header, chosen at random.
url = 'http://www.bing.com/'
# Pool of real browser UA strings (original had these literals broken across
# physical lines, which was a SyntaxError — rejoined here).
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",  # Chrome
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN) AppleWebKit/537.36 (KHTML, like Gecko) Version/5.0.1 Safari/537.36",  # Safari
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0",  # Firefox
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",  # IE
]
ua = random.choice(ua_list)  # pick one User-Agent at random
request = Request(url)  # build the request object
request.add_header('User-Agent', ua)  # attach the spoofed header
response = urlopen(request)  # BUG FIX: original never created `response`
with response:  # context manager closes the connection when done
    # `status` and getcode() are two ways to read the same HTTP status code
    print(1, response.status, response.getcode(), response.reason)
    # Final URL of the response; differs from the request URL after a
    # redirect, e.g. http://www.bing.com/ -> http://cn.bing.com/
    print(2, response.geturl())
    print(3, response.info())  # response headers
    print(4, response.read())  # body as bytes
print(5, request.get_header('User-agent'))  # header lookup is capitalize()-normalized
print(6, request.headers)
print(7, 'user-agent'.capitalize())
# Build a Baidu search URL from user input. quote() percent-encodes any
# non-ASCII characters (e.g. Chinese) while safe=string.printable keeps
# the ASCII URL syntax (:/?=&) intact.
cmd = input('>>>')
end_url2 = urllib.parse.quote('https://www.baidu.com/s?wd={}'.format(cmd), safe=string.printable)
print(end_url2,'!!!!!!!!!!!!!!!!')
# BUG FIX: original called `request.urlopen(...)` where `request` was the
# Request *instance* created above (AttributeError); use the urlopen function.
res = urlopen(end_url2)
data = res.read().decode()
res.close()  # release the connection
print(data)
# Demo: urlencode() turns a dict into a percent-encoded query string,
# e.g. {'wd': '中文'} -> 'wd=%E4%B8%AD%E6%96%87'. (Original ran before the
# file's `import urllib.parse` executed; imports are now at the top.)
url2 = 'https://www.baidu.com/s?'
str_params2 = urllib.parse.urlencode({'wd': '中文'})
print(str_params2, '~~~~~~~~~~~~~~~')
print(url2+str_params2, '+++++++++++')
# Demo: percent-encoding a URL that contains Chinese characters before
# requesting it — urlopen() raises UnicodeEncodeError on raw non-ASCII URLs.
url = 'https://www.baidu.com/s?wd='
name = '中国'
final_url = url + name
print(final_url)
# final_url encodes to https://www.baidu.com/s?wd=%E4%B8%AD%E5%9B%BD
# (BUG FIX: original comment showed %E4%BA%BA%E5%B7%A5, the encoding of a
# different word, not of '中国')
from urllib import request  # NOTE: rebinds `request`, shadowing the Request object created earlier in this file
import urllib.parse
import string
response = request.urlopen(url)  # the all-ASCII base URL needs no encoding
print(response)
print(response.read())
response.close()  # avoid leaking the connection
# quote() percent-encodes the non-ASCII chars; safe=string.printable
# leaves the ASCII URL structure (:/?=) untouched.
new_url = urllib.parse.quote(final_url, safe=string.printable)
print(new_url)
res = request.urlopen(new_url)
print(res.read())
res.close()