#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Libraries
import urllib.error
import bs4
from urllib import request
# Make a GET request
# Issue a plain GET request to Baidu's homepage.
response = request.urlopen("https://www.baidu.com")
# The body arrives as bytes; decode it from UTF-8 before printing.
page = response.read()
print(page.decode('utf-8'))
# Make a POST request
import urllib.parse
# Encode the request payload as bytes
# URL-encode the payload and convert it to the bytes body urlopen expects.
data = urllib.parse.urlencode({"hello": "world"}).encode('utf-8')
# Send the request — simulates a user login / form submission
# POST the encoded payload to httpbin, which echoes what it received.
response = request.urlopen("http://httpbin.org/post", data=data)
body = response.read()
print(body.decode('utf_8'))
# Timeout handling
# Demonstrate a request timeout: 0.01s is far too short for a round trip,
# so urlopen raises and we land in the except branch.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01)
    print(response.read().decode('utf_8'))
except urllib.error.URLError as e:
    # URLError's canonical home is urllib.error; urllib.request merely
    # re-exports it. Catching it from urllib.error matches the import at
    # the top of the file.
    print("time out")
# Pretend to be a browser:
# - Open a browser and load the target page
# - Open DevTools (Inspect) -> Network -> refresh the page
# - Find the User-Agent entry under the request headers
# - Copy the User-Agent value into the code below
# douban.com rejects the default urllib User-Agent, so spoof a real browser.
url = "https://www.douban.com"
# Request headers copied from a Chrome session (see the steps above).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
}
# Form payload, URL-encoded and converted to bytes.
data = urllib.parse.urlencode({"name": "chenhy"}).encode('utf-8')
# Build a Request object explicitly so we can attach headers and force POST.
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
# Example
# Example: fetch the Douban movie homepage with a browser-like User-Agent.
url = "https://movie.douban.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
}
## the request object
req = urllib.request.Request(url=url, headers=headers)
## the response object
response = urllib.request.urlopen(req)
# Keep the decoded body in `content`: an HTTP response stream can only be
# read once, and the BeautifulSoup code later in this file parses `content`
# (which was previously never assigned — a NameError).
content = response.read().decode("utf-8")
print(content)
# Extracting elements with BeautifulSoup
# Use BeautifulSoup to pull out the elements we are after — the URL of each
# search result. First, feed the fetched page content into BeautifulSoup.
# Parse the fetched HTML. Naming the parser explicitly avoids bs4's
# GuessedAtParserWarning and keeps behavior identical across machines
# regardless of which parsers happen to be installed.
# NOTE(review): assumes `content` holds the fetched page HTML — confirm it
# is assigned before this point.
soup = bs4.BeautifulSoup(content, "html.parser")
# In the page markup, each result link is an <a> nested under an <h3>.
linkElems = soup.select('h3 > a')
# Slice instead of range(10): no IndexError when fewer than 10 links match.
for link in linkElems[:10]:
    print(link.get('href'))