本文涉及以下库：
1、urllib 中的 request
2、requests
3、lxml 中的 etree
4、bs4 中的 BeautifulSoup
一、定制请求头
1、urllib 中的 request
# -*- coding=utf-8 -*-
# Build a urllib Request that carries a custom User-Agent header and show
# the headers stored on the request object (no network call is made).
from urllib import request

url = 'http://example.webscraping.com'

# Without a headers argument the request has no User-Agent of its own:
# req = request.Request(url)

# Custom User-Agent. The header name must be spelled with a hyphen
# ('User-Agent'); an underscore spelling ('User_agent') is a different,
# unrelated header name and does not set the user agent.
headers = {'User-Agent': 'hahahah'}
req = request.Request(url, headers=headers)

# Request normalises header names with str.capitalize(), so the stored key
# is 'User-agent' and that exact spelling must be used for the lookup.
user_agent = req.get_header('User-agent')
print(req.headers)
2、requests
# -*- coding=utf-8 -*-
import requests
def main():
# 定制请求头
url = 'http://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
r = requests.get(url, headers