# Web scraping: notes on using the urllib and requests libraries

import urllib.request

####### 1. Using the urllib library #####
# urllib has three commonly used modules: request, error and parse.
# The request module provides urlopen() as well as the Request class.
'''
# Textbook p.103: the simplest way to fetch a page with a GET request is urlopen()
response = urllib.request.urlopen('https://nba.hupu.com/', timeout=5)   # timeout sets the maximum wait time
print(response.read().decode('utf-8'))

# Here we use the Request class
url = "https://nba.hupu.com/"
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
print(response.status)                 # check the response status code
html = response.read().decode('utf-8')
print(html)
print(type(html))                      # check the type of html (str)

from urllib import request, parse

url = "https://httpbin.org/post"
# Build the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Host": "httpbin.org"
}
form = {
    "name": "harden"
}
data = bytes(parse.urlencode(form), encoding='utf8')   # POST data must be encoded to bytes
req = request.Request(url, headers=headers, data=data, method='POST')
response = request.urlopen(req)
html = response.read().decode('utf8')
print(html)

# Using a proxy (p.110)
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
try:
    response = opener.open("https://www.baidu.com/")
    print(response.read().decode('utf8'))
except URLError as e:
    print(e.reason)

# Handling cookies
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
# print(response.read().decode('utf8'))
for item in cookie:
    print(item.name + "=" + item.value)
'''

# Using the parse module
'''
# urlparse() splits a URL into its components
from urllib.parse import urlparse

a = urlparse('https://bbs.hupu.com/22236262.html')
for i in range(4):      # scheme, netloc, path, params (ParseResult has 6 fields in total)
    print(a[i])
'''

####### 2. Using the requests library #####   note: the package is requests, not request
'''
import requests

a = requests.get('https://bbs.hupu.com/22236262.html')
print(a.text)
print(a.cookies)
'''

# Crawl the Zhihu "Explore" page
'''
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
r = requests.get('https://www.zhihu.com/explore', headers=headers)
print(r.text)
print(type(r.text))
'''

# Getting cookies (p.130)
'''
import requests

r = requests.get("https://www.baidu.com")
# print(r.text)
print(r.cookies)
print(type(r.cookies))
'''
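# (Added sketch, not from the textbook) Instead of copying the whole Cookie header
# by hand as in the next example, a requests.Session keeps cookies between requests
# automatically; httpbin.org is assumed here only as a test endpoint.
'''
import requests

s = requests.Session()
s.get('https://httpbin.org/cookies/set/number/123456789')   # the server sets a cookie
r = s.get('https://httpbin.org/cookies')                     # the Session sends it back automatically
print(r.text)                                                # expected to contain {"number": "123456789"}
'''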
z_c0="2|1:0|10:1526182363|4:z_c0|92:Mi4xSGZsV0F3QUFBQUFBVUNHSG91S1ZEU1lBQUFCZ0FsVk4yX19rV3dCLTFSQjRVSTh4YkhTeFU2SHI5RHpYd05UYXNn|cf51057a878911843149efceeb32a706ca6588e6eb7ed00459f65dab38a21b3b"' } r=requests.get('https://www.zhihu.com/',headers=headers) print(r.text) ''' #测试ssl的证书验证问题 import requests a=requests.get("https://music.douban.com/chart") print(a.text)
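# (Added sketch) The requests counterpart of the urllib ProxyHandler example above:
# pass a proxies dict directly to requests.get(). 127.0.0.1:9743 is the same assumed
# local proxy and the request will fail unless such a proxy is actually running.
'''
import requests

proxies = {
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
}
r = requests.get('https://www.baidu.com', proxies=proxies, timeout=5)
print(r.status_code)
'''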