Python爬虫起步
urllib模块
import urllib.request
获取url
# Fetch the page at `url` (expected to be defined before this snippet runs).
# The context manager closes the HTTP response even if read() raises,
# instead of leaving the connection to be reclaimed by garbage collection.
with urllib.request.urlopen(url) as response:
    content = response.read()  # raw response body as bytes
# Decode the bytes to text; this assumes the page is UTF-8 encoded and
# raises UnicodeDecodeError otherwise.
content = content.decode('utf-8')
print(content)
设置请求头与用户代理(爬取一些需要登录的网站时)
# Custom request headers sent with the request so it carries a browser-like
# User-Agent, a referer and a cookie instead of urllib's defaults.
headers = {
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'close',
    'referer': 'https://www.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 10; HMA-AL00; HMSCore 5.3.0.312; GMSCore 20.15.16) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.93 HuaweiBrowser/11.1.2.301 Mobile Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
    # Placeholder cookie value; replace with the real session cookie.
    'cookie': 'cookie=abc;',
}
# Build a Request that attaches the headers above to `url`
# (`url` is expected to be defined before this snippet runs).
req = urllib.request.Request(url=url, headers=headers)
# Send the request; the context manager closes the HTTP response even if
# read() raises, rather than leaving it open until garbage collection.
with urllib.request.urlopen(req) as response:
    content = response.read()  # raw response body as bytes