import requests

# Fetch the Bilibili homepage with requests and print the decoded HTML
req = requests.get('https://www.bilibili.com/')
print(req.content.decode('utf-8'))
import urllib.request

# The same GET request using the standard-library urllib
req = urllib.request.urlopen('https://www.bilibili.com/')
print(req.read().decode('utf-8'))
Jump to the code snippets below.
Use POST here; we will need it later when submitting a password.
import urllib.request
import urllib.parse

# POST request: urlencode the form fields, encode to bytes, pass as data
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://www.httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
import urllib.error

try:
    # Fail fast if the server does not respond within 0.1 seconds
    response = urllib.request.urlopen("http://www.httpbin.org/get", timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("time out!")
Set a timeout so that if the request has not succeeded within the given time, it is skipped.
# Check the response's HTTP status code
response = urllib.request.urlopen("http://www.httpbin.org/get")
print(response.status)
Status codes: 200 means success; 404 means the page does not exist; 405 means the request method is not allowed; 418 means the server has recognized the client as a crawler (the "I'm a teapot" joke status).
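Note that urlopen does not return 4xx/5xx responses; it raises urllib.error.HTTPError, which carries the status code. A minimal sketch, using httpbin's /status/418 test endpoint:

import urllib.request
import urllib.error

try:
    urllib.request.urlopen("http://www.httpbin.org/status/418")
except urllib.error.HTTPError as e:
    print(e.code)  # 418: the server has flagged the client as a crawler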
# List every response header as (name, value) pairs
response = urllib.request.urlopen("http://www.httpbin.org/get")
print(response.getheaders())

# Fetch a single header by name
response = urllib.request.urlopen("http://www.httpbin.org/get")
print(response.getheader("Server"))
find_all()
String filter: finds tags whose name matches the string exactly.
Regular-expression filter: uses search() to match tag names, so partial matches count.
from bs4 import BeautifulSoup
import re

with open("./1234.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")

# Regular-expression filter: matches any tag whose name contains "a"
t_list = bs.find_all(re.compile("a"))
print(t_list)
Every tag whose name contains the letter "a" gets pulled out (a, table, span, and so on).
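By contrast, the string filter only returns exact tag-name matches. A quick sketch against the same bs object parsed above:

# String filter: only tags named exactly "a" (links), not <table> or <head>
t_list = bs.find_all("a")
print(t_list)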
kwargs arguments: you can filter on any attribute as a keyword argument, such as class_ or id.
text argument: filters on a tag's text content instead of its name.
limit argument: caps how many matches are returned. See the sketch below.
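A minimal sketch of all three argument styles, reusing the bs object parsed above; the attribute values ("mnav", "head", "hao123") are assumptions about what 1234.html contains:

# kwargs: filter by attribute; class is a Python keyword, hence class_
print(bs.find_all(class_="mnav"))
print(bs.find_all(id="head"))

# text: match tags by their text content ("hao123" is an assumed value)
print(bs.find_all(text="hao123"))

# limit: return at most the first 3 <a> tags
print(bs.find_all("a", limit=3))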
Search by class name: .class
Search by id: #id
Search inside a tag with a child selector such as head > title.
A tilde selects sibling tags on the same level:
.mnav ~ .bri picks out any .bri element that sits at the same level as .mnav, as in the sketch below.
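A minimal sketch of these CSS selectors with select(), again reusing the bs object; the id value "u1" is an assumption about the sample page:

print(bs.select(".mnav"))         # by class name
print(bs.select("#u1"))           # by id ("u1" is an assumed id)
print(bs.select("head > title"))  # inside a tag: <title> child of <head>
print(bs.select(".mnav ~ .bri"))  # .bri siblings on the same level as .mnav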