python urllib库、requests库

最新推荐文章于 2024-11-03 07:57:46 发布

LLLLLLLLLLLLIU

最新推荐文章于 2024-11-03 07:57:46 发布

阅读量293

点赞数

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/weixin_44851971/article/details/109411721

版权

Python 专栏收录该内容

45 篇文章 8 订阅

订阅专栏

urllib库

import urllib.request
import re
from 爬虫不同的库发出请求.usergent import usergent


# Fetch a page with urllib through a custom opener and extract its <title>.
# Per the original author's note, usergent() lives in a separate module and
# returns a randomly chosen User-Agent string.
header = {"User-Agent": usergent()}
url = r"http://www.youdao.com/"

# Request is the abstraction of a URL request; `headers` takes a dict.
urr = urllib.request.Request(url, headers=header)

# Build a custom opener on top of a plain HTTP handler.
httphandle = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(httphandle)

# Open the Request object rather than the bare URL: the User-Agent header is
# attached to `urr`, so opening `url` directly would never send it.
# The context manager closes the connection once the body has been read.
with opener.open(urr) as reply:
    response = reply.read().decode()

# Alternative: install the opener globally so that every later urlopen() call
# uses it by default.
# urllib.request.install_opener(opener)
# response = urllib.request.urlopen(urr).read().decode()

# `response` now holds the page's HTML as a str.
# print(response)

# Clean the data with a regular expression: only the page title is needed.
# Named `title_pattern` so the builtin `str` is not shadowed.
title_pattern = r"<title>(.*?)</title>"
str1 = re.findall(title_pattern, response)
print(str1)

requests库

import requests
from 爬虫不同的库发出请求.usergent import usergent

# GET request: arguments include the URL (url), the query-string part of the
# URL (params), the request headers (headers), and so on.
# User-Agent and Cookie both belong in the request headers;
# params completes the URL with its query string.
# The example below is disabled by being wrapped in a bare string literal.
'''

url = "https://zhengzhou.anjuke.com/sale/?"  # Anjuke (housing listings)
header = {"User-Agent": usergent()}
# Query parameters sent along with the URL
wd = {"kw": "和润林湖美景"}

# The request itself is the call that can fail, so it belongs inside the
# try. requests raises its own ConnectionError class; the builtin
# ConnectionError is not one of its base classes and would not catch it.
try:
    response = requests.get(url, params=wd, headers=header)
except requests.exceptions.ConnectionError:
    print("停止输出")
else:
    print(response.text)
'''

# POST request: sends data to the server; arguments include the URL (url),
# the data to send (data), and so on.
# The example below is disabled by being wrapped in a bare string literal.
# It posts the form fields "name" and "pass" to the URL and prints the
# response body.
'''
url = "https://www.iqianyue.com/mypost"
#向服务器发送的数据
formdata = {"name":"lau","pass":"1234567"}
res = requests.post(url,data=formdata)
print(res.text)
'''


# Simulate a logged-in visit by sending a Cookie header.
# The cookie is copied from the browser's request headers; once it expires,
# a fresh one has to be copied again.
# The example below is disabled by being wrapped in a bare string literal.
# NOTE(review): this is a real session cookie committed to source; prefer
# loading it from the environment or a local, untracked file.
'''
url = "https://www.douban.com/"  # Douban
# The cookie is assembled from adjacent string literals, which Python joins
# with nothing in between. A backslash continuation inside a single literal
# would also embed the next line's leading indentation into the cookie value
# and split tokens in the middle.
header = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    "Cookie": (
        'bid=9NeBIpRMcgA; douban-fav-remind=1; __yadk_uid=3iD14p0fcxvP0n6iCkuDlDT9hepomKUQ; __gads=ID=0e83425bd55684c3:T=1'
        '599739192:S=ALNI_MYtlY7GlAQOEDr2e4gdIpCfbiy-rA; __utmz=30149280.1601262569.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
        'll="118163"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.22511; __utma=30149280.1710552684.1599739191.1602768951.16'
        '02826779.4; __utmc=30149280; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1603017305%2C%22https%3A%2F%2Fwww.baidu.com%'
        '2Flink%3Furl%3DogJfE080UPFVsLV17_aBgVbxeps-23zwz0h0njmmCXTsaH-hhy3TRISY3yGBeIvrXwD2gLlw3fmXVoR5b_Exsq%26wd%3D%26eqi'
        'd%3Da18b53850000683e000000055f71537b%22%5D; _pk_ses.100001.8cb4=*; ap_v=0,6.0; dbcl2="225112613:1oJIKQB3pRQ"; ck=NKF4; _pk_id.'
        '100001.8cb4=45ef19b90fa8e5d7.1599739191.5.1603017428.1602826794.'
    ),
}
res = requests.get(url, headers=header)
print(res.text)
'''


# Obtain cookies programmatically.
# The example below is disabled by being wrapped in a bare string literal.
# It sends a GET request, reads the response's `cookies` attribute, converts
# it to a dict with requests.utils.dict_from_cookiejar, and prints the dict.
'''
header = {"User-Agent":'Mozilla/5.0 (Windows NT 6.1; Win64; x64)\
 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
          }
url = "https://www.douban.com/" #豆瓣网
#1.发出请求 获得服务器响应
res = requests.get(url,headers=header)
#2.响应对象调用cookies，返回cookiejar对象
cookiesjar = res.cookies  
#3.将cookiejar对象转化为字典
cookiesdic = requests.utils.dict_from_cookiejar(cookiesjar)
#得到是一个字典，cookie的name为键，value为值
print(cookiesdic)
'''

# Access the site through a session object so that the same session is
# reused for the GET, the login POST, and the follow-up GET.
# The example below is disabled by being wrapped in a bare string literal.
'''
import os

# Notes on the Douban login flow: https://fishc.com.cn/thread-169934-1-1.html
# To find the POST URL and the required form fields, submit a wrong account
# and password once and inspect the request in the browser's Network tab.
url = "https://accounts.douban.com/j/mobile/login/basic"
header = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
# Create the session object
session = requests.session()
# Form fields required for login. The account name and password are read
# from the environment instead of being hard-coded in the source file.
data = {"ck": "",
        "remember": "true",
        "name": os.environ["DOUBAN_NAME"],
        "password": os.environ["DOUBAN_PASSWORD"]}
# Submit the data to obtain the login cookie:
# issue one GET first, then POST the form fields
res1 = session.get(url=url, headers=header)
res2 = session.post(url=url, data=data, headers=header)
print(res2.text)
# Request again with the same session
res3 = session.get(url)
print(res3.text)
# Repeated attempts trigger a CAPTCHA; wait a while before submitting again
'''