Python基础获取页面源规则

康梓潼

已于 2022-08-01 08:37:54 修改

阅读量102

点赞数

分类专栏： python 文章标签： python 开发语言

于 2022-07-25 20:03:41 首次发布

本文链接：https://blog.csdn.net/weixin_46453221/article/details/125982333

版权

python 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-
# @Time : 2022/7/25 19:19
# @Author : khm
# @File : testurllib.py
# @Software : PyCharm
import urllib.request
import urllib.parse
import codecs

'''
# 获取一个get请求
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))  # 对获取到的网页源码进行utf-8解码
'''
'''
# 获取一个post请求
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))  # 对获取到的网页源码进行utf-8解码
'''
'''
# 超时处理
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01)
    print(response.read().decode("utf-8"))  # 对获取到的网页源码进行utf-8解码
except urllib.error.URLError as e:
    print("time out!")
'''
'''
try:
    response = urllib.request.urlopen("http://www.baidu.com", timeout=5)
    print(response.status)  # HTTP status code of the response
    print(response.getheaders())  # list of (header, value) tuples from the response
except Exception as e:
    print(e)
'''
# Present the request as coming from a regular browser, then fetch the
# home page and save it to a local file.
url = "http://www.douban.com"
# urlencode() returns a str; encode it once to get the bytes a POST body needs.
data = urllib.parse.urlencode({"username": "khm"}).encode("utf-8")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
# Download and decode first so that a failed request never truncates an
# existing output file; the context manager closes the connection even if
# read() or decode() raises.
with urllib.request.urlopen(req, timeout=5) as response:
    html = response.read().decode("utf-8")
# The file is closed automatically, including when write() raises.
with open('douban_index.html', mode="w", encoding="utf-8") as f:
    f.write(html)