# -*- coding: utf-8 -*-
# @Time : 2022/7/25 19:19
# @Author : khm
# @File : testurllib.py
# @Software : PyCharm
import urllib.request
import urllib.parse
import codecs
# Disabled example: the code below lives inside a bare triple-quoted string,
# so it is never executed. It shows a plain GET request whose response body
# is decoded as UTF-8 and printed.
'''
# 获取一个get请求
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8")) # 对获取到的网页源码进行utf-8解码
'''
# Disabled example: the code below lives inside a bare triple-quoted string,
# so it is never executed. It shows a POST request: the form fields are
# URL-encoded, converted to UTF-8 bytes and passed to urlopen via `data=`.
'''
# 获取一个post请求
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8")) # 对获取到的网页源码进行utf-8解码
'''
# Disabled example: the code below lives inside a bare triple-quoted string,
# so it is never executed. It shows timeout handling: a request with a 0.01 s
# timeout, printing "time out!" when urllib.error.URLError is caught.
# NOTE(review): the try/except bodies are not indented as they appear here;
# they would need indenting before this snippet could be re-enabled.
'''
# 超时处理
try:
response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01)
print(response.read().decode("utf-8")) # 对获取到的网页源码进行utf-8解码
except urllib.error.URLError as e:
print("time out!")
'''
# Disabled example: the code below lives inside a bare triple-quoted string,
# so it is never executed. It shows how to inspect a response: print the
# HTTP status and the list of response headers, printing any exception raised.
# NOTE(review): the trailing comments on the two print lines talk about UTF-8
# decoding, which does not match what those lines do; they look copy-pasted.
# NOTE(review): the try/except bodies are not indented as they appear here;
# they would need indenting before this snippet could be re-enabled.
'''
try:
response = urllib.request.urlopen("http://www.baidu.com", timeout=5)
print(response.status) # 对获取到的网页源码进行utf-8解码
print(response.getheaders()) # 对获取到的网页源码进行utf-8解码
except Exception as e:
print(e)
'''
# Pose as a regular browser by sending a desktop Chrome User-Agent header,
# then fetch the Douban home page and write its HTML to a local file.
url = "http://www.douban.com"
# POST body: URL-encode the form fields, then encode to UTF-8 bytes
# (urlencode returns str; .encode() already yields bytes).
data = urllib.parse.urlencode({"username": "khm"}).encode("utf-8")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
}
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
# Context managers close the HTTP response and the output file even when
# reading, decoding or writing raises.
with urllib.request.urlopen(req, timeout=5) as response:
    html = response.read().decode("utf-8")
# Open the output file only after the body has been fetched and decoded, so a
# failed request does not truncate an existing douban_index.html.
with open("douban_index.html", mode="w", encoding="utf-8") as f:
    f.write(html)
# Python基础获取页面源规则 (Python basics: fetching a page's source)
# 于 2022-07-25 20:03:41 首次发布 (first published 2022-07-25 20:03:41)